diff --git a/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp b/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp index 21dfdc4..90c071f 100644 --- a/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp +++ b/9_Firmware/9_1_Microcontroller/9_1_3_C_Cpp_Code/main.cpp @@ -646,6 +646,12 @@ typedef enum { ERROR_TEMPERATURE_HIGH, ERROR_MEMORY_ALLOC, ERROR_WATCHDOG_TIMEOUT, + /* AUDIT-S10 follow-up: gpio_dig7 (PD15) reports FPGA DSP control faults + * (range-decimator watchdog, CIC->FIR CDC overrun). Placed AFTER the + * ERROR_RF_PA_OVERCURRENT..ERROR_POWER_SUPPLY critical range so the + * dispatcher routes to attemptErrorRecovery (FPGA reset pulse) instead + * of Emergency_Stop. */ + ERROR_FPGA_DSP_STALL, ERROR_COUNT // must be last — used for bounds checking error_strings[] } SystemError_t; @@ -840,6 +846,42 @@ SystemError_t checkSystemHealth(void) { // 9. Watchdog check is performed at function entry (see step 0). + // 10. AUDIT-S10 follow-up: FPGA DSP control-fault flag (PD15 / FPGA_DIG7). + // gpio_dig7 = (range_decim_watchdog | cic_fir_overrun), sticky in the + // FPGA source domain until a full bitstream reset clears it. Distinct + // from PD13 (signal-saturation; AGC reacts) — these are control-path + // stalls requiring an FPGA reset, not a gain change. AGC was previously + // mis-reacting to these on the old aggregated PD13 line. + // + // Recovery: attemptErrorRecovery(ERROR_FPGA_DSP_STALL) pulses PD12 to + // reload the bitstream — that clears all sticky monitors as a side + // effect (no MCU-driven reset_monitors path exists). + // + // Debounce: 2 consecutive HIGH samples on a 1 s cadence so a single + // glitch does not provoke an FPGA reset. Counter restarts at 0 once + // fired (and naturally resets when PD15 goes LOW after reset). + // last_dsp_check is committed BEFORE the early return per the + // AUDIT-CAL pattern, so a flapping fault never bypasses rate-limit. 
+ static uint32_t last_dsp_check = 0; + static uint8_t dsp_stall_streak = 0; + if (HAL_GetTick() - last_dsp_check > 1000) { + last_dsp_check = HAL_GetTick(); + bool dsp_fault = (HAL_GPIO_ReadPin(FPGA_DIG7_GPIO_Port, + FPGA_DIG7_Pin) == GPIO_PIN_SET); + if (dsp_fault) { + if (dsp_stall_streak < 2) dsp_stall_streak++; + } else { + dsp_stall_streak = 0; + } + if (dsp_stall_streak >= 2) { + dsp_stall_streak = 0; // arm for next assertion post-recovery + current_error = ERROR_FPGA_DSP_STALL; + DIAG_ERR("FPGA", + "Health check: gpio_dig7 (PD15) HIGH 2x — DSP control fault"); + return current_error; + } + } + if (current_error != ERROR_NONE) { DIAG_ERR("SYS", "checkSystemHealth returning error code %d", current_error); } @@ -908,15 +950,23 @@ void attemptErrorRecovery(SystemError_t error) { break; case ERROR_FPGA_COMM: - /* MCU-A6: FPGA stopped responding (USB-CDC silence, status timeout). - * Pulse the FPGA reset line on PD12 LOW->10 ms->HIGH (same pattern - * the boot sequence uses, line ~2733). Bitstream re-initializes - * from flash. We do NOT touch PA rails here — MCU-N2/N11 already - * sequences the cold-boot reset BEFORE PA Vdd, but at runtime the - * PAs are live and re-resetting the FPGA briefly leaves - * adar_tr_x undefined for ~10 ms. The trade-off is acceptable - * vs. losing the radar entirely; if the operator wants a - * power-cycle-clean recovery they can issue Emergency_Stop. */ + case ERROR_FPGA_DSP_STALL: + /* MCU-A6 + AUDIT-S10 follow-up: + * ERROR_FPGA_COMM: FPGA stopped responding (USB-CDC silence, + * status timeout). + * ERROR_FPGA_DSP_STALL: gpio_dig7 (PD15) HIGH for 2+ samples; + * range_decim_watchdog or cic_fir_overrun is sticky in the + * FPGA source domain. No standalone MCU->FPGA reset_monitors + * path exists, so the cheapest recovery is a full bitstream + * reload — same PD12 pulse as ERROR_FPGA_COMM, which clears + * all sticky monitors as a side effect. 
+ * Pulse FPGA reset line PD12 LOW->10 ms->HIGH (same pattern as + * boot sequence, line ~2733). Bitstream re-initializes from flash. + * We do NOT touch PA rails — MCU-N2/N11 sequences cold-boot reset + * BEFORE PA Vdd, but at runtime the PAs are live and re-resetting + * the FPGA briefly leaves adar_tr_x undefined for ~10 ms. The + * trade-off is acceptable vs losing the radar entirely; for a + * power-cycle-clean recovery the operator can issue Emergency_Stop. */ DIAG("FPGA", "Recovery: pulsing FPGA reset on PD12 (LOW for 10 ms)"); HAL_GPIO_WritePin(GPIOD, GPIO_PIN_12, GPIO_PIN_RESET); HAL_Delay(10); @@ -1108,7 +1158,8 @@ void handleSystemError(SystemError_t error) { "Power supply fault", "System temperature high", "Memory allocation failed", - "Watchdog timeout" + "Watchdog timeout", + "FPGA DSP control fault" }; static_assert(sizeof(error_strings) / sizeof(error_strings[0]) == ERROR_COUNT, diff --git a/9_Firmware/9_1_Microcontroller/tests/Makefile b/9_Firmware/9_1_Microcontroller/tests/Makefile index e5631bb..92c4980 100644 --- a/9_Firmware/9_1_Microcontroller/tests/Makefile +++ b/9_Firmware/9_1_Microcontroller/tests/Makefile @@ -75,6 +75,7 @@ TESTS_STANDALONE := test_bug12_pa_cal_loop_inverted \ test_audit_c17_bmp180_sentinel_and_cast \ test_audit_cal_bmp180_begin \ test_audit_imu_watchdog_cadence \ + test_audit_s10_dsp_stall_polling \ test_gap3_iwdg_config \ test_gap3_temperature_max \ test_gap3_idq_periodic_reread \ @@ -195,6 +196,9 @@ test_audit_cal_bmp180_begin: test_audit_cal_bmp180_begin.c test_audit_imu_watchdog_cadence: test_audit_imu_watchdog_cadence.c $(CC) $(CFLAGS) $< -o $@ +test_audit_s10_dsp_stall_polling: test_audit_s10_dsp_stall_polling.c + $(CC) $(CFLAGS) $< -o $@ + # Gap-3 safety tests -- mock-only (needs spy log for GPIO sequence) test_gap3_emergency_stop_rails: test_gap3_emergency_stop_rails.c $(MOCK_OBJS) $(CC) $(CFLAGS) $(INCLUDES) $< $(MOCK_OBJS) -o $@ diff --git a/9_Firmware/9_1_Microcontroller/tests/test_audit_s10_dsp_stall_polling.c 
b/9_Firmware/9_1_Microcontroller/tests/test_audit_s10_dsp_stall_polling.c new file mode 100644 index 0000000..aeab753 --- /dev/null +++ b/9_Firmware/9_1_Microcontroller/tests/test_audit_s10_dsp_stall_polling.c @@ -0,0 +1,248 @@ +/******************************************************************************* + * test_audit_s10_dsp_stall_polling.c + * + * AUDIT-S10 follow-up: MCU-side polling of gpio_dig7 (PD15 / FPGA_DIG7). + * + * Background: AUDIT-S10 (commit `58154a6`) split the FPGA's six-flag aggregate + * gpio_dig5 into two MCU-visible bits: gpio_dig5 keeps signal-saturation only + * (AGC reacts) and gpio_dig7 (PD15) carries control-fault classes + * (range-decim watchdog | cic_fir overrun). Pre-follow-up the MCU did NOT + * poll PD15, so DSP control faults were invisible to the recovery dispatcher + * and accumulated until the operator noticed downstream symptoms. + * + * The post-fix predicate (matches checkSystemHealth section 10): + * + * static uint32_t last_dsp_check = 0; + * static uint8_t dsp_stall_streak = 0; + * if (HAL_GetTick() - last_dsp_check > 1000) { + * last_dsp_check = HAL_GetTick(); // commit BEFORE check + * bool fault = read_pd15(); + * if (fault) { if (dsp_stall_streak < 2) dsp_stall_streak++; } + * else { dsp_stall_streak = 0; } + * if (dsp_stall_streak >= 2) { + * dsp_stall_streak = 0; // arm for next post-recovery + * return ERROR_FPGA_DSP_STALL; + * } + * } + * return ERROR_NONE; + * + * Test strategy: + * - Extract the post-fix predicate into a pure function. + * - Drive it with simulated HAL_GetTick() and a controllable PD15 mock. + * - Verify: rate-limit holds (1 Hz cadence), 2-sample debounce blocks + * glitches, sustained fault fires error exactly once per assertion, + * last_dsp_check committed on every fired-watchdog call (AUDIT-CAL + * pattern), and HAL_GetTick wrap is handled correctly. 
+ * - Add a counter-test using a pre-fix-style "fire on first HIGH" predicate + * to demonstrate the glitch-driven false-positive class the debounce + * guards against. + ******************************************************************************/ +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> + +/* ---- Mock PD15 state ---- */ +static bool pd15_high = false; +static int pd15_read_count = 0; + +static bool mock_read_pd15(void) +{ + pd15_read_count++; + return pd15_high; +} + +/* ============================================================================ + * Post-fix predicate (matches main.cpp section 10). + * Returns 1 iff this call raises ERROR_FPGA_DSP_STALL. + * ============================================================================ */ +static uint32_t last_dsp_check_postfix = 0; +static uint8_t dsp_stall_streak_postfix = 0; + +static int dsp_watchdog_postfix(uint32_t now_tick) +{ + if (now_tick - last_dsp_check_postfix > 1000) { + last_dsp_check_postfix = now_tick; /* commit BEFORE check */ + bool fault = mock_read_pd15(); + if (fault) { + if (dsp_stall_streak_postfix < 2) dsp_stall_streak_postfix++; + } else { + dsp_stall_streak_postfix = 0; + } + if (dsp_stall_streak_postfix >= 2) { + dsp_stall_streak_postfix = 0; /* arm for next assertion */ + return 1; /* ERROR_FPGA_DSP_STALL */ + } + } + return 0; +} + +/* ============================================================================ + * Pre-fix-style predicate: fires on first HIGH read with no debounce. Kept as + * a counter-test to demonstrate the glitch-driven false-positive that the + * 2-sample debounce in the post-fix predicate guards against. 
+ * ============================================================================ */ +static uint32_t last_dsp_check_prefix = 0; + +static int dsp_watchdog_prefix_no_debounce(uint32_t now_tick) +{ + if (now_tick - last_dsp_check_prefix > 1000) { + last_dsp_check_prefix = now_tick; + if (mock_read_pd15()) { + return 1; + } + } + return 0; +} + +/* ---- Test bookkeeping ---- */ +static void reset_state(void) +{ + last_dsp_check_postfix = 0; + dsp_stall_streak_postfix = 0; + last_dsp_check_prefix = 0; + pd15_high = false; + pd15_read_count = 0; +} + +int main(void) +{ + printf("=== AUDIT-S10 follow-up: PD15 polling + ERROR_FPGA_DSP_STALL ===\n"); + + /* ---------------------------------------------------------------- + * T1: Healthy FPGA — PD15 stays LOW → no error across many windows. + * Drive 60 s of 10 ms-spaced ticks; expect 0 errors and streak=0. + * ---------------------------------------------------------------- */ + printf(" T1 healthy FPGA (PD15 LOW) — 0 errors over 60 s... "); + reset_state(); + int errors = 0; + for (int i = 0; i <= 6000; i++) { + errors += dsp_watchdog_postfix((uint32_t)(i * 10)); /* 0..60 s */ + } + assert(errors == 0); + assert(dsp_stall_streak_postfix == 0); + /* Polling cadence: with > 1000 strict and 10 ms ticks, a poll fires every + 1010 ms: t=1010, 2020, ..., 59590 — across [0, 60000] inclusive that's 59 polls. */ + assert(pd15_read_count == 59); + printf("PASS (polls=%d)\n", pd15_read_count); + + /* ---------------------------------------------------------------- + * T2: Single-sample glitch — PD15 HIGH for 1 window only, LOW after. + * Debounce must block: streak hits 1 then resets, no error. + * ---------------------------------------------------------------- */ + printf(" T2 single-sample glitch — debounce blocks... "); + reset_state(); + /* Cross threshold once with PD15 HIGH (glitch). 
*/ + pd15_high = true; + int e = dsp_watchdog_postfix(1001); + assert(e == 0); + assert(dsp_stall_streak_postfix == 1); + /* Next window: PD15 back to LOW (glitch cleared). */ + pd15_high = false; + e = dsp_watchdog_postfix(2002); + assert(e == 0); + assert(dsp_stall_streak_postfix == 0); + /* Many subsequent LOW windows — no error ever. */ + for (uint32_t t = 3003; t < 60000; t += 1001) { + assert(dsp_watchdog_postfix(t) == 0); + } + printf("PASS\n"); + + /* ---------------------------------------------------------------- + * T3: Sustained DSP fault — PD15 HIGH for 2 consecutive windows. + * Expect: streak reaches 2, fires ERROR_FPGA_DSP_STALL on second poll. + * After fire, streak resets to 0 (armed for next post-recovery assertion). + * ---------------------------------------------------------------- */ + printf(" T3 sustained fault (PD15 HIGH x2) — fires ERROR_FPGA_DSP_STALL... "); + reset_state(); + pd15_high = true; + /* First poll after threshold — streak=1, no error. */ + e = dsp_watchdog_postfix(1001); + assert(e == 0); + assert(dsp_stall_streak_postfix == 1); + /* Second poll — streak=2, fires error, then resets to 0. */ + e = dsp_watchdog_postfix(2002); + assert(e == 1); + assert(dsp_stall_streak_postfix == 0); + /* last_dsp_check committed BEFORE return — must equal 2002 even though + we returned an error. Same AUDIT-CAL invariant as IMU watchdog. */ + assert(last_dsp_check_postfix == 2002u); + printf("PASS\n"); + + /* ---------------------------------------------------------------- + * T4: After fire, intra-window calls do NOT re-fire (rate-limit holds). + * ---------------------------------------------------------------- */ + printf(" T4 post-fire rate-limit holds within window... "); + /* Continue from T3 state: t=2002, fault still HIGH, streak=0. */ + /* Call again at t=2003 (only 1 ms after last poll) — under rate-limit, + must NOT poll PD15 again. 
*/ + int reads_before = pd15_read_count; + e = dsp_watchdog_postfix(2003); + assert(e == 0); + assert(pd15_read_count == reads_before); /* no PD15 read */ + printf("PASS\n"); + + /* ---------------------------------------------------------------- + * T5: Sustained fault — error cadence is 1 per ~2 s (1 s window + + * 2-sample debounce). Across 60 s of continuous fault, expect bounded + * fire rate (NOT on every call, as a check with no rate limit would). + * ---------------------------------------------------------------- */ + printf(" T5 sustained fault — error rate bounded over 60 s... "); + reset_state(); + pd15_high = true; + int total_errors = 0; + int total_calls = 0; + for (int i = 0; i <= 6000; i++) { + if (dsp_watchdog_postfix((uint32_t)(i * 10))) total_errors++; + total_calls++; + } + /* Polling at t=1010, 2020, ..., 59590 — 59 polls. Streak pattern: + 1, 2(fire+reset to 0), 1, 2(fire+reset), ... so error fires every + 2 polls. 59 polls / 2 = 29 errors (integer). Allow ±1 for boundary. */ + assert(total_calls == 6001); + assert(total_errors >= 28 && total_errors <= 30); + /* MCU-N1 latch at error_count > 10: under sustained fault would fire + in ~22 s. That's acceptable — gives operator time to intervene + before SAFE-MODE; bench-test should validate. Pre-fix without any + polling, this fault was MCU-invisible until downstream symptoms. */ + printf("PASS (calls=%d errors=%d)\n", total_calls, total_errors); + + /* ---------------------------------------------------------------- + * T6: Counter-test — no-debounce predicate fires on first HIGH window, + * even for a single-sample glitch. Demonstrates the false-positive class + * the post-fix 2-sample debounce guards against. + * ---------------------------------------------------------------- */ + printf(" T6 counter-test: no-debounce predicate false-fires on glitch... "); + reset_state(); + pd15_high = true; + /* Single HIGH glitch crosses threshold. 
*/ + e = dsp_watchdog_prefix_no_debounce(1001); + assert(e == 1); /* false positive — bug demo */ + /* Glitch clears next window. */ + pd15_high = false; + e = dsp_watchdog_prefix_no_debounce(2002); + assert(e == 0); + printf("PASS\n"); + + /* ---------------------------------------------------------------- + * T7: HAL_GetTick() 32-bit wrap. Same modulo-arithmetic guarantee as + * test_audit_imu_watchdog_cadence T6 / test_gap3_health_watchdog_cold_start T8. + * ---------------------------------------------------------------- */ + printf(" T7 HAL_GetTick wrap (0xFFFFFF00 -> 0x00000064)... "); + reset_state(); + pd15_high = false; + /* Seed: prime the watchdog at t=0xFFFFFF00 (just before wrap). */ + last_dsp_check_postfix = 0xFFFFFF00u; + /* Now ask at 0x00000064 — true elapsed = 0x164 = 356 ms, BELOW 1 s. */ + int err = dsp_watchdog_postfix(0x00000064u); + assert(err == 0); + assert(pd15_read_count == 0); /* no poll */ + /* Now jump >1 s past the wrap: 0x00000064 + 1001 = 0x0000044D. */ + err = dsp_watchdog_postfix(0x0000044Du); + assert(err == 0); /* PD15 LOW */ + assert(pd15_read_count == 1); /* one poll */ + printf("PASS\n"); + + printf("\n=== AUDIT-S10 follow-up: ALL TESTS PASSED ===\n\n"); + return 0; +}