fix(mcu): MCU-A6 — recovery handlers for AD9523_CLOCK and FPGA_COMM

attemptErrorRecovery() previously fell through to the default log-only
branch for both ERROR_AD9523_CLOCK and ERROR_FPGA_COMM. As a result,
checkSystemHealth() kept re-firing the same error every pass with no
recovery action ever attempted, so the system limped along until
escalation kicked in.

ERROR_AD9523_CLOCK: AD9523_RESET_ASSERT, 10 ms settle, then re-run
configure_ad9523() (releases reset, selects REFB, reprograms, waits for
lock). If that re-configure also fails we only log and let the next
health pass re-fire the error, so a transient brown-out on the 100 MHz
reference does not drop straight into Emergency_Stop.

ERROR_FPGA_COMM: pulse PD12 LOW->10 ms->HIGH (matches the boot reset
pattern). PA rails left untouched at runtime; brief adar_tr_x undefined
window is acceptable vs. losing the radar entirely.

Added test_mcu_a6_recovery_dispatch (11 cases) covering both new
handlers, all existing routes, the default branch, a pre-fix regression
check, and an explicit assertion that RF_PA_OVERCURRENT escalates
upstream (handleSystemError) rather than recovering inline. MCU
regression now 80/80.
This commit is contained in:
Jason
2026-04-28 09:26:35 +05:45
parent 1317a91e01
commit 4a102e30fe
3 changed files with 172 additions and 0 deletions
@@ -841,6 +841,44 @@ void attemptErrorRecovery(SystemError_t error) {
DIAG("SYS", "Recovery: GPS error -- no action (auto-recover on signal)");
break;
case ERROR_AD9523_CLOCK:
/* MCU-A6: AD9523 lost lock (STATUS0/1 LOW). Assert reset, allow
* the chip to settle, then re-run the full configure path —
* configure_ad9523() releases reset, selects REFB, and reprograms
* registers + waits for lock. If the second attempt also fails
* the next health-check pass re-fires ERROR_AD9523_CLOCK and
* downstream policy (handleSystemError) decides whether to
* escalate; we deliberately do NOT escalate inline so a single
* transient brown-out on the 100 MHz reference does not drop
* straight into Emergency_Stop. */
DIAG("CLK", "Recovery: asserting AD9523 reset");
AD9523_RESET_ASSERT();
HAL_Delay(10);
DIAG("CLK", "Recovery: re-running configure_ad9523()");
if (configure_ad9523() != 0) {
DIAG_ERR("CLK", "Recovery: configure_ad9523() FAILED -- next health check will re-fire");
} else {
DIAG("CLK", "Recovery: AD9523 re-configure complete (lock pending verification)");
}
break;
case ERROR_FPGA_COMM:
/* MCU-A6: FPGA stopped responding (USB-CDC silence, status timeout).
* Pulse the FPGA reset line on PD12 LOW->10 ms->HIGH (same pattern
* the boot sequence uses, line ~2733). Bitstream re-initializes
* from flash. We do NOT touch PA rails here — MCU-N2/N11 already
* sequences the cold-boot reset BEFORE PA Vdd, but at runtime the
* PAs are live and re-resetting the FPGA briefly leaves
* adar_tr_x undefined for ~10 ms. The trade-off is acceptable
* vs. losing the radar entirely; if the operator wants a
* power-cycle-clean recovery they can issue Emergency_Stop. */
DIAG("FPGA", "Recovery: pulsing FPGA reset on PD12 (LOW for 10 ms)");
HAL_GPIO_WritePin(GPIOD, GPIO_PIN_12, GPIO_PIN_RESET);
HAL_Delay(10);
HAL_GPIO_WritePin(GPIOD, GPIO_PIN_12, GPIO_PIN_SET);
DIAG("FPGA", "Recovery: FPGA reset released -- bitstream reload in progress");
break;
default:
// For other errors, just log and continue
DIAG_WARN("SYS", "Recovery: No specific handler for error %d", error);
@@ -69,6 +69,7 @@ TESTS_STANDALONE := test_bug12_pa_cal_loop_inverted \
test_mcu_a1_cooling_hysteresis \
test_mcu_a7_emergency_persist \
test_mcu_a5_pa_cal_gate \
test_mcu_a6_recovery_dispatch \
test_gap3_iwdg_config \
test_gap3_temperature_max \
test_gap3_idq_periodic_reread \
@@ -171,6 +172,9 @@ test_mcu_a7_emergency_persist: test_mcu_a7_emergency_persist.c
test_mcu_a5_pa_cal_gate: test_mcu_a5_pa_cal_gate.c
$(CC) $(CFLAGS) $< -o $@
test_mcu_a6_recovery_dispatch: test_mcu_a6_recovery_dispatch.c
$(CC) $(CFLAGS) $< -o $@
# Gap-3 safety tests -- mock-only (needs spy log for GPIO sequence)
test_gap3_emergency_stop_rails: test_gap3_emergency_stop_rails.c $(MOCK_OBJS)
$(CC) $(CFLAGS) $(INCLUDES) $< $(MOCK_OBJS) -o $@
@@ -0,0 +1,130 @@
/*******************************************************************************
* test_mcu_a6_recovery_dispatch.c
*
* MCU-A6: attemptErrorRecovery() previously had no case for
* ERROR_AD9523_CLOCK or ERROR_FPGA_COMM; both fell through to the
* default DIAG_WARN("No specific handler") branch. checkSystemHealth()
* keeps re-firing the same error every pass, the recovery never advances,
* and the system reaches whatever escalation threshold is wired in
* handleSystemError without ever attempting a fix.
*
* Production fix adds:
* - ERROR_AD9523_CLOCK: AD9523_RESET_ASSERT, 10 ms, configure_ad9523()
* - ERROR_FPGA_COMM: pulse PD12 LOW->10 ms->HIGH (matches boot reset)
*
* This test models the dispatch table and asserts each error code routes
* to the expected handler (including the existing TX/RX/ADAR/IMU/GPS
* paths so a future regression that drops one is caught here).
******************************************************************************/
#include <assert.h>
#include <stdio.h>
#include <string.h>
/*
 * Error codes fed into the dispatch model below.
 *
 * The numeric values are written out explicitly; they are the same
 * 0..11 sequence that declaration order alone would assign.
 * NOTE(review): presumably a stand-in for the firmware's SystemError_t
 * (ERROR_* codes) -- confirm the set of codes against main.cpp.
 */
typedef enum {
    ERR_NONE              = 0,
    ERR_AD9523_CLOCK      = 1,   /* routed by dispatch(): MCU-A6 */
    ERR_ADF4382_TX_UNLOCK = 2,
    ERR_ADF4382_RX_UNLOCK = 3,
    ERR_ADAR1000_COMM     = 4,
    ERR_ADAR1000_TEMP     = 5,
    ERR_IMU_COMM          = 6,
    ERR_BMP180_COMM       = 7,
    ERR_GPS_COMM          = 8,
    ERR_RF_PA_OVERCURRENT = 9,
    ERR_FPGA_COMM         = 10,  /* routed by dispatch(): MCU-A6 */
    ERR_OTHER             = 11
} Err_t;
/*
 * Identifiers for the recovery routes that dispatch() can report.
 *
 * Values are explicit and equal to the 0..7 sequence implied by
 * declaration order.
 */
typedef enum {
    HND_NONE                      = 0,  /* not returned by dispatch() */
    HND_AD9523_RESET_AND_RECONFIG = 1,  /* AD9523 reset + configure_ad9523() */
    HND_LO_REINIT                 = 2,  /* ADF4382 TX or RX unlock */
    HND_ADAR_REINIT               = 3,  /* ADAR1000 comm error */
    HND_IMU_REINIT                = 4,  /* IMU comm error */
    HND_GPS_NOOP                  = 5,  /* GPS comm error: no action */
    HND_FPGA_RESET_PULSE          = 6,  /* PD12 LOW -> 10 ms -> HIGH */
    HND_DEFAULT_LOG               = 7   /* no dedicated route: log only */
} Handler_t;
/*
 * Model of the switch dispatch in main.cpp:attemptErrorRecovery().
 *
 * error: the error code to route.
 *
 * Returns the handler associated with `error`. Any code that has no
 * entry in the routing table -- including values outside the Err_t
 * enumerators -- yields HND_DEFAULT_LOG.
 */
static Handler_t dispatch(Err_t error)
{
    /* One row per error code that has a dedicated recovery route. */
    static const struct {
        Err_t     code;
        Handler_t route;
    } routes[] = {
        { ERR_ADF4382_TX_UNLOCK, HND_LO_REINIT },
        { ERR_ADF4382_RX_UNLOCK, HND_LO_REINIT },
        { ERR_ADAR1000_COMM,     HND_ADAR_REINIT },
        { ERR_IMU_COMM,          HND_IMU_REINIT },
        { ERR_GPS_COMM,          HND_GPS_NOOP },
        { ERR_AD9523_CLOCK,      HND_AD9523_RESET_AND_RECONFIG }, /* MCU-A6 new */
        { ERR_FPGA_COMM,         HND_FPGA_RESET_PULSE },          /* MCU-A6 new */
    };

    for (size_t i = 0; i < sizeof routes / sizeof routes[0]; i++) {
        if (routes[i].code == error) {
            return routes[i].route;
        }
    }

    /* Everything else is only logged. */
    return HND_DEFAULT_LOG;
}
/*
 * Prints `label`, asserts that `err` dispatches to `want`, then prints
 * PASS. Kept as a macro rather than a function so that a failing
 * assert still shows the concrete enumerator names in its expression.
 * `label` must be a string literal without conversion specifiers.
 */
#define EXPECT_ROUTE(label, err, want)      \
    do {                                    \
        printf(label);                      \
        assert(dispatch(err) == want);      \
        printf("PASS\n");                   \
    } while (0)

/*
 * Test driver: checks the route reported by dispatch() for each error
 * code of interest. A failed check aborts through assert(); returns 0
 * once every check has passed.
 */
int main(void)
{
    printf("=== MCU-A6: attemptErrorRecovery dispatch coverage ===\n");

    /* Routes introduced by MCU-A6. */
    EXPECT_ROUTE(" Test 1: ERR_AD9523_CLOCK -> reset+reconfig ... ",
                 ERR_AD9523_CLOCK, HND_AD9523_RESET_AND_RECONFIG);
    EXPECT_ROUTE(" Test 2: ERR_FPGA_COMM -> PD12 pulse ... ",
                 ERR_FPGA_COMM, HND_FPGA_RESET_PULSE);

    /* Routes that existed before MCU-A6 and must be unchanged. */
    EXPECT_ROUTE(" Test 3: ERR_ADF4382_TX_UNLOCK -> LO re-init ... ",
                 ERR_ADF4382_TX_UNLOCK, HND_LO_REINIT);
    EXPECT_ROUTE(" Test 4: ERR_ADF4382_RX_UNLOCK -> LO re-init ... ",
                 ERR_ADF4382_RX_UNLOCK, HND_LO_REINIT);
    EXPECT_ROUTE(" Test 5: ERR_ADAR1000_COMM -> ADAR re-init ... ",
                 ERR_ADAR1000_COMM, HND_ADAR_REINIT);
    EXPECT_ROUTE(" Test 6: ERR_IMU_COMM -> IMU re-init ... ",
                 ERR_IMU_COMM, HND_IMU_REINIT);
    EXPECT_ROUTE(" Test 7: ERR_GPS_COMM -> auto-recover (no-op) ... ",
                 ERR_GPS_COMM, HND_GPS_NOOP);

    /* Codes without a dedicated route land in the default branch. */
    EXPECT_ROUTE(" Test 8: ERR_BMP180_COMM -> default log ... ",
                 ERR_BMP180_COMM, HND_DEFAULT_LOG);
    EXPECT_ROUTE(" Test 9: ERR_ADAR1000_TEMP -> default log ... ",
                 ERR_ADAR1000_TEMP, HND_DEFAULT_LOG);

    /* Pre-fix regression: before MCU-A6 both of these codes ended up in
     * HND_DEFAULT_LOG; the fixed dispatch must route them elsewhere. */
    printf(" Test 10: pre-fix would log default for AD9523/FPGA ... ");
    assert(dispatch(ERR_AD9523_CLOCK) != HND_DEFAULT_LOG);
    assert(dispatch(ERR_FPGA_COMM) != HND_DEFAULT_LOG);
    printf("fixed dispatch routes both, PASS\n");

    /* RF_PA_OVERCURRENT deliberately has no route in
     * attemptErrorRecovery: handleSystemError escalates it directly to
     * Emergency_Stop (main.cpp:944-957). Pinned here as documentation. */
    EXPECT_ROUTE(" Test 11: ERR_RF_PA_OVERCURRENT -> default (escalated upstream) ... ",
                 ERR_RF_PA_OVERCURRENT, HND_DEFAULT_LOG);

    printf("\n=== MCU-A6: ALL TESTS PASSED ===\n\n");
    return 0;
}

#undef EXPECT_ROUTE