mcu(health): poll PD15 + dispatch ERROR_FPGA_DSP_STALL (AUDIT-S10 follow-up)

AUDIT-S10 (commit `58154a6`) split the FPGA's six-flag aggregate
gpio_dig5 into two MCU-visible bits: gpio_dig5 keeps signal-saturation
(AGC reacts), gpio_dig7 (PD15) carries control-fault classes
(range_decim_watchdog | cic_fir_overrun). Until now the MCU did NOT
poll PD15, so DSP control faults were invisible to the recovery
dispatcher.

Changes:

- New `ERROR_FPGA_DSP_STALL` enum value placed AFTER ERROR_WATCHDOG_TIMEOUT
  so the dispatcher routes to attemptErrorRecovery (FPGA reset pulse) not
  Emergency_Stop. Updated error_strings[] in lockstep (static_assert
  enforces).

- checkSystemHealth section 10 polls PD15 at 1 Hz with 2-sample debounce.
  `last_dsp_check` is committed BEFORE the early return per AUDIT-CAL
  pattern, so a flapping fault never bypasses the rate-limit. Streak
  counter resets to 0 after firing (armed for next post-recovery
  assertion) AND resets naturally when PD15 returns LOW.

- attemptErrorRecovery: ERROR_FPGA_DSP_STALL fans into the existing
  ERROR_FPGA_COMM PD12 reset case (stacked case labels, same body). No
  MCU-driven reset_monitors path exists; full bitstream reload clears
  all sticky monitors as a side effect.

Tests:
- tests/test_audit_s10_dsp_stall_polling.c (NEW, 7 scenarios, 7/7 PASS):
  T1 healthy 60s, T2 single-sample glitch blocked by debounce, T3
  sustained fault fires once, T4 post-fire rate-limit holds within
  window, T5 sustained fault rate bounded (29 errors / 60s -- MCU-N1
  latch at error_count>10 fires in ~22s, gives operator time to
  intervene), T6 counter-test demos no-debounce false-positive on
  glitch, T7 HAL_GetTick 32-bit wrap.
- MCU host suite 35/35 PASS (was 34/34; +1 new, 0 regressions).
This commit is contained in:
Jason
2026-04-29 23:42:21 +05:45
parent 853d2a5fd9
commit 534905263f
3 changed files with 313 additions and 10 deletions
@@ -646,6 +646,12 @@ typedef enum {
ERROR_TEMPERATURE_HIGH,
ERROR_MEMORY_ALLOC,
ERROR_WATCHDOG_TIMEOUT,
/* AUDIT-S10 follow-up: gpio_dig7 (PD15) reports FPGA DSP control faults
* (range-decimator watchdog, CIC->FIR CDC overrun). Placed AFTER the
* ERROR_RF_PA_OVERCURRENT..ERROR_POWER_SUPPLY critical range so the
* dispatcher routes to attemptErrorRecovery (FPGA reset pulse) instead
* of Emergency_Stop. */
ERROR_FPGA_DSP_STALL,
ERROR_COUNT // must be last — used for bounds checking error_strings[]
} SystemError_t;
@@ -840,6 +846,42 @@ SystemError_t checkSystemHealth(void) {
// 9. Watchdog check is performed at function entry (see step 0).
// 10. AUDIT-S10 follow-up: FPGA DSP control-fault flag (PD15 / FPGA_DIG7).
// gpio_dig7 = (range_decim_watchdog | cic_fir_overrun), sticky in the
// FPGA source domain until a full bitstream reset clears it. Distinct
// from PD13 (signal-saturation; AGC reacts) — these are control-path
// stalls requiring an FPGA reset, not a gain change. AGC was previously
// mis-reacting to these on the old aggregated PD13 line.
//
// Recovery: attemptErrorRecovery(ERROR_FPGA_DSP_STALL) pulses PD12 to
// reload the bitstream — that clears all sticky monitors as a side
// effect (no MCU-driven reset_monitors path exists).
//
// Debounce: 2 consecutive HIGH samples on a 1 s cadence so a single
// glitch does not provoke an FPGA reset. Counter restarts at 0 once
// fired (and naturally resets when PD15 goes LOW after reset).
// last_dsp_check is committed BEFORE the early return per the
// AUDIT-CAL pattern, so a flapping fault never bypasses rate-limit.
static uint32_t last_dsp_check = 0;
static uint8_t dsp_stall_streak = 0;
if (HAL_GetTick() - last_dsp_check > 1000) {
last_dsp_check = HAL_GetTick();
bool dsp_fault = (HAL_GPIO_ReadPin(FPGA_DIG7_GPIO_Port,
FPGA_DIG7_Pin) == GPIO_PIN_SET);
if (dsp_fault) {
if (dsp_stall_streak < 2) dsp_stall_streak++;
} else {
dsp_stall_streak = 0;
}
if (dsp_stall_streak >= 2) {
dsp_stall_streak = 0; // arm for next assertion post-recovery
current_error = ERROR_FPGA_DSP_STALL;
DIAG_ERR("FPGA",
"Health check: gpio_dig7 (PD15) HIGH 2x — DSP control fault");
return current_error;
}
}
if (current_error != ERROR_NONE) {
DIAG_ERR("SYS", "checkSystemHealth returning error code %d", current_error);
}
@@ -908,15 +950,23 @@ void attemptErrorRecovery(SystemError_t error) {
break;
case ERROR_FPGA_COMM:
/* MCU-A6: FPGA stopped responding (USB-CDC silence, status timeout).
* Pulse the FPGA reset line on PD12 LOW->10 ms->HIGH (same pattern
* the boot sequence uses, line ~2733). Bitstream re-initializes
* from flash. We do NOT touch PA rails here — MCU-N2/N11 already
* sequences the cold-boot reset BEFORE PA Vdd, but at runtime the
* PAs are live and re-resetting the FPGA briefly leaves
* adar_tr_x undefined for ~10 ms. The trade-off is acceptable
* vs. losing the radar entirely; if the operator wants a
* power-cycle-clean recovery they can issue Emergency_Stop. */
case ERROR_FPGA_DSP_STALL:
/* MCU-A6 + AUDIT-S10 follow-up:
* ERROR_FPGA_COMM: FPGA stopped responding (USB-CDC silence,
* status timeout).
* ERROR_FPGA_DSP_STALL: gpio_dig7 (PD15) HIGH for 2+ samples;
* range_decim_watchdog or cic_fir_overrun is sticky in the
* FPGA source domain. No standalone MCU->FPGA reset_monitors
* path exists, so the cheapest recovery is a full bitstream
* reload — same PD12 pulse as ERROR_FPGA_COMM, which clears
* all sticky monitors as a side effect.
* Pulse FPGA reset line PD12 LOW->10 ms->HIGH (same pattern as
* boot sequence, line ~2733). Bitstream re-initializes from flash.
* We do NOT touch PA rails — MCU-N2/N11 sequences cold-boot reset
* BEFORE PA Vdd, but at runtime the PAs are live and re-resetting
* the FPGA briefly leaves adar_tr_x undefined for ~10 ms. The
* trade-off is acceptable vs losing the radar entirely; for a
* power-cycle-clean recovery the operator can issue Emergency_Stop. */
DIAG("FPGA", "Recovery: pulsing FPGA reset on PD12 (LOW for 10 ms)");
HAL_GPIO_WritePin(GPIOD, GPIO_PIN_12, GPIO_PIN_RESET);
HAL_Delay(10);
@@ -1108,7 +1158,8 @@ void handleSystemError(SystemError_t error) {
"Power supply fault",
"System temperature high",
"Memory allocation failed",
"Watchdog timeout"
"Watchdog timeout",
"FPGA DSP control fault"
};
static_assert(sizeof(error_strings) / sizeof(error_strings[0]) == ERROR_COUNT,
@@ -75,6 +75,7 @@ TESTS_STANDALONE := test_bug12_pa_cal_loop_inverted \
test_audit_c17_bmp180_sentinel_and_cast \
test_audit_cal_bmp180_begin \
test_audit_imu_watchdog_cadence \
test_audit_s10_dsp_stall_polling \
test_gap3_iwdg_config \
test_gap3_temperature_max \
test_gap3_idq_periodic_reread \
@@ -195,6 +196,9 @@ test_audit_cal_bmp180_begin: test_audit_cal_bmp180_begin.c
test_audit_imu_watchdog_cadence: test_audit_imu_watchdog_cadence.c
$(CC) $(CFLAGS) $< -o $@
test_audit_s10_dsp_stall_polling: test_audit_s10_dsp_stall_polling.c
$(CC) $(CFLAGS) $< -o $@
# Gap-3 safety tests -- mock-only (needs spy log for GPIO sequence)
test_gap3_emergency_stop_rails: test_gap3_emergency_stop_rails.c $(MOCK_OBJS)
$(CC) $(CFLAGS) $(INCLUDES) $< $(MOCK_OBJS) -o $@
@@ -0,0 +1,248 @@
/*******************************************************************************
* test_audit_s10_dsp_stall_polling.c
*
* AUDIT-S10 follow-up: MCU-side polling of gpio_dig7 (PD15 / FPGA_DIG7).
*
* Background: AUDIT-S10 (commit `58154a6`) split the FPGA's six-flag aggregate
* gpio_dig5 into two MCU-visible bits: gpio_dig5 keeps signal-saturation only
* (AGC reacts) and gpio_dig7 (PD15) carries control-fault classes
* (range-decim watchdog | cic_fir overrun). Pre-follow-up the MCU did NOT
* poll PD15, so DSP control faults were invisible to the recovery dispatcher
* and accumulated until the operator noticed downstream symptoms.
*
* The post-fix predicate (matches checkSystemHealth section 10):
*
* static uint32_t last_dsp_check = 0;
* static uint8_t dsp_stall_streak = 0;
* if (HAL_GetTick() - last_dsp_check > 1000) {
* last_dsp_check = HAL_GetTick(); // commit BEFORE check
* bool fault = read_pd15();
* if (fault) { if (dsp_stall_streak < 2) dsp_stall_streak++; }
* else { dsp_stall_streak = 0; }
* if (dsp_stall_streak >= 2) {
* dsp_stall_streak = 0; // arm for next post-recovery
* return ERROR_FPGA_DSP_STALL;
* }
* }
* return ERROR_NONE;
*
* Test strategy:
* - Extract the post-fix predicate into a pure function.
* - Drive it with simulated HAL_GetTick() and a controllable PD15 mock.
* - Verify: rate-limit holds (1 Hz cadence), 2-sample debounce blocks
* glitches, sustained fault fires error exactly once per assertion,
* last_dsp_check committed on every fired-watchdog call (AUDIT-CAL
* pattern), and HAL_GetTick wrap is handled correctly.
* - Add a counter-test using a pre-fix-style "fire on first HIGH" predicate
* to demonstrate the glitch-driven false-positive class the debounce
* guards against.
******************************************************************************/
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
/* ---- Mock PD15 state ---- */
static bool pd15_high = false;
static int pd15_read_count = 0;
static bool mock_read_pd15(void)
{
pd15_read_count++;
return pd15_high;
}
/* ============================================================================
* Post-fix predicate (matches main.cpp section 10).
* Returns 1 iff this call raises ERROR_FPGA_DSP_STALL.
* ============================================================================ */
static uint32_t last_dsp_check_postfix = 0;
static uint8_t dsp_stall_streak_postfix = 0;
static int dsp_watchdog_postfix(uint32_t now_tick)
{
if (now_tick - last_dsp_check_postfix > 1000) {
last_dsp_check_postfix = now_tick; /* commit BEFORE check */
bool fault = mock_read_pd15();
if (fault) {
if (dsp_stall_streak_postfix < 2) dsp_stall_streak_postfix++;
} else {
dsp_stall_streak_postfix = 0;
}
if (dsp_stall_streak_postfix >= 2) {
dsp_stall_streak_postfix = 0; /* arm for next assertion */
return 1; /* ERROR_FPGA_DSP_STALL */
}
}
return 0;
}
/* ============================================================================
* Pre-fix-style predicate: fires on first HIGH read with no debounce. Kept as
* a counter-test to demonstrate the glitch-driven false-positive that the
* 2-sample debounce in the post-fix predicate guards against.
* ============================================================================ */
static uint32_t last_dsp_check_prefix = 0;
static int dsp_watchdog_prefix_no_debounce(uint32_t now_tick)
{
if (now_tick - last_dsp_check_prefix > 1000) {
last_dsp_check_prefix = now_tick;
if (mock_read_pd15()) {
return 1;
}
}
return 0;
}
/* ---- Test bookkeeping ---- */
static void reset_state(void)
{
last_dsp_check_postfix = 0;
dsp_stall_streak_postfix = 0;
last_dsp_check_prefix = 0;
pd15_high = false;
pd15_read_count = 0;
}
int main(void)
{
printf("=== AUDIT-S10 follow-up: PD15 polling + ERROR_FPGA_DSP_STALL ===\n");
/* ----------------------------------------------------------------
* T1: Healthy FPGA PD15 stays LOW no error across many windows.
* Drive 60 s of 10 ms-spaced ticks; expect 0 errors and streak=0.
* ---------------------------------------------------------------- */
printf(" T1 healthy FPGA (PD15 LOW) — 0 errors over 60 s... ");
reset_state();
int errors = 0;
for (int i = 0; i <= 6000; i++) {
errors += dsp_watchdog_postfix((uint32_t)(i * 10)); /* 0..60 s */
}
assert(errors == 0);
assert(dsp_stall_streak_postfix == 0);
/* Polling cadence: window crosses every 1 s; with > 1000 strict, fires
at t=1010, 2020, ..., 60060 across [0, 60000] inclusive that's 59 polls. */
assert(pd15_read_count == 59);
printf("PASS (polls=%d)\n", pd15_read_count);
/* ----------------------------------------------------------------
* T2: Single-sample glitch PD15 HIGH for 1 window only, LOW after.
* Debounce must block: streak hits 1 then resets, no error.
* ---------------------------------------------------------------- */
printf(" T2 single-sample glitch — debounce blocks... ");
reset_state();
/* Cross threshold once with PD15 HIGH (glitch). */
pd15_high = true;
int e = dsp_watchdog_postfix(1001);
assert(e == 0);
assert(dsp_stall_streak_postfix == 1);
/* Next window: PD15 back to LOW (glitch cleared). */
pd15_high = false;
e = dsp_watchdog_postfix(2002);
assert(e == 0);
assert(dsp_stall_streak_postfix == 0);
/* Many subsequent LOW windows — no error ever. */
for (uint32_t t = 3003; t < 60000; t += 1001) {
assert(dsp_watchdog_postfix(t) == 0);
}
printf("PASS\n");
/* ----------------------------------------------------------------
* T3: Sustained DSP fault PD15 HIGH for 2 consecutive windows.
* Expect: streak reaches 2, fires ERROR_FPGA_DSP_STALL on second poll.
* After fire, streak resets to 0 (armed for next post-recovery assertion).
* ---------------------------------------------------------------- */
printf(" T3 sustained fault (PD15 HIGH x2) — fires ERROR_FPGA_DSP_STALL... ");
reset_state();
pd15_high = true;
/* First poll after threshold — streak=1, no error. */
e = dsp_watchdog_postfix(1001);
assert(e == 0);
assert(dsp_stall_streak_postfix == 1);
/* Second poll — streak=2, fires error, then resets to 0. */
e = dsp_watchdog_postfix(2002);
assert(e == 1);
assert(dsp_stall_streak_postfix == 0);
/* last_dsp_check committed BEFORE return — must equal 2002 even though
we returned an error. Same AUDIT-CAL invariant as IMU watchdog. */
assert(last_dsp_check_postfix == 2002u);
printf("PASS\n");
/* ----------------------------------------------------------------
* T4: After fire, intra-window calls do NOT re-fire (rate-limit holds).
* ---------------------------------------------------------------- */
printf(" T4 post-fire rate-limit holds within window... ");
/* Continue from T3 state: t=2002, fault still HIGH, streak=0. */
/* Call again at t=2003 (only 1 ms after last poll) — under rate-limit,
must NOT poll PD15 again. */
int reads_before = pd15_read_count;
e = dsp_watchdog_postfix(2003);
assert(e == 0);
assert(pd15_read_count == reads_before); /* no PD15 read */
printf("PASS\n");
/* ----------------------------------------------------------------
* T5: Sustained fault error cadence is 1 per ~2 s (1 s window +
* 2-sample debounce). Across 60 s of continuous fault, expect bounded
* fire rate (NOT every iteration as the pre-fix path would).
* ---------------------------------------------------------------- */
printf(" T5 sustained fault — error rate bounded over 60 s... ");
reset_state();
pd15_high = true;
int total_errors = 0;
int total_calls = 0;
for (int i = 0; i <= 6000; i++) {
if (dsp_watchdog_postfix((uint32_t)(i * 10))) total_errors++;
total_calls++;
}
/* Polling at t=1010, 2020, ..., 60060 — 59 polls. Streak pattern:
1, 2(fire+reset to 0), 1, 2(fire+reset), ... so error fires every
2 polls. 59 polls / 2 = 29 errors (integer). Allow ±1 for boundary. */
assert(total_calls == 6001);
assert(total_errors >= 28 && total_errors <= 30);
/* MCU-N1 latch at error_count > 10: under sustained fault would fire
in ~22 s. That's acceptable gives operator time to intervene
before SAFE-MODE; bench-test should validate. Pre-fix without any
polling, this fault was MCU-invisible until downstream symptoms. */
printf("PASS (calls=%d errors=%d)\n", total_calls, total_errors);
/* ----------------------------------------------------------------
* T6: Counter-test no-debounce predicate fires on first HIGH window,
* even for a single-sample glitch. Demonstrates the false-positive class
* the post-fix 2-sample debounce guards against.
* ---------------------------------------------------------------- */
printf(" T6 counter-test: no-debounce predicate false-fires on glitch... ");
reset_state();
pd15_high = true;
/* Single HIGH glitch crosses threshold. */
e = dsp_watchdog_prefix_no_debounce(1001);
assert(e == 1); /* false positive — bug demo */
/* Glitch clears next window. */
pd15_high = false;
e = dsp_watchdog_prefix_no_debounce(2002);
assert(e == 0);
printf("PASS\n");
/* ----------------------------------------------------------------
* T7: HAL_GetTick() 32-bit wrap. Same modulo-arithmetic guarantee as
* test_audit_imu_watchdog_cadence T6 / test_gap3_health_watchdog_cold_start T8.
* ---------------------------------------------------------------- */
printf(" T7 HAL_GetTick wrap (0xFFFFFF00 -> 0x00000064)... ");
reset_state();
pd15_high = false;
/* Seed: prime the watchdog at t=0xFFFFFF00 (just before wrap). */
last_dsp_check_postfix = 0xFFFFFF00u;
/* Now ask at 0x00000064 — true elapsed = 0x164 = 356 ms, BELOW 1 s. */
int err = dsp_watchdog_postfix(0x00000064u);
assert(err == 0);
assert(pd15_read_count == 0); /* no poll */
/* Now jump >1 s past the wrap: 0x00000064 + 1001 = 0x0000044D. */
err = dsp_watchdog_postfix(0x0000044Du);
assert(err == 0); /* PD15 LOW */
assert(pd15_read_count == 1); /* one poll */
printf("PASS\n");
printf("\n=== AUDIT-S10 follow-up: ALL TESTS PASSED ===\n\n");
return 0;
}