feat(fpga): swap matched-filter chain to Xilinx LogiCORE FFT v9.1 IP

Replaces the in-house iterative fft_engine.v in the matched-filter chain
with the Pipelined Streaming Xilinx FFT IP, closing RX-NEW-3 (FFT chain
~11x too slow vs PRI budget).

Components:
  * ip/xfft_2048_ip/xfft_2048_ip.xci — committed IP definition
    (16-bit fixed point, BFP scaling, convergent rounding, natural order,
    pipelined-streaming, BRAM data/reorder/phase factors). Vivado
    regenerates .dcp / sim-netlist from this on each build.
  * scripts/50t/gen_xfft_2048_ip.tcl — IP-Catalog generation script
  * scripts/50t/run_xfft_xsim.sh — XSim batch runner for tb_xfft_2048_xsim
  * xfft_2048.v — AXI-Stream wrapper. FFT_USE_XILINX_IP define routes to
    real LogiCORE for synth/XSim; falls back to fft_engine batched
    one-shot for iverilog (unit coverage only).
  * fft_engine_axi_bridge.v — exposes legacy fft_engine port surface on
    top of the xfft_2048 AXI wrapper, so the chain swap is a 1-line
    module-name change.
  * matched_filter_processing_chain.v — fft_engine -> fft_engine_axi_bridge
  * scripts/50t/build_50t.tcl — read_ip + generate_target + synth_ip;
    adds FFT_USE_XILINX_IP to verilog defines.
  * tb/tb_xfft_2048_xsim.v — XSim verification (DC, impulse, tone bin 128).
    All 5 assertions PASS on remote with the real IP; tuser=0x0a (BLK_EXP=10)
    confirms BFP scaling working.

Local iverilog regression: 32/34 PASS — identical to baseline. Same two
RX-NEW-3 failures (Receiver Integration, Matched Filter Chain) — these
only resolve in remote XSim with the real IP, since iverilog uses the
fft_engine fallback inside xfft_2048 (~150K cycles/pass, not the
~2200-cycle Pipelined Streaming throughput). MF cosim 4/4 PASS confirms
bridge bit-exact in fallback mode.

Pending: remote XSim of tb_radar_receiver_final to demonstrate Doppler
frames produced within PRI budget; remote synth to confirm DSP/timing
post-IP.
This commit is contained in:
Jason
2026-04-23 12:39:33 +05:45
parent cc6691dec9
commit 5c8cc8c96a
10 changed files with 1256 additions and 6 deletions
+283
View File
@@ -0,0 +1,283 @@
`timescale 1ns / 1ps
// ============================================================================
// tb_xfft_2048_xsim.v XSim verification of xfft_2048 wrapper with real IP
// ============================================================================
// Compiled with `+define+FFT_USE_XILINX_IP` so the wrapper instantiates the
// LogiCORE FFT v9.1 (xfft_2048_ip). Cannot run in iverilog because that path
// uses Xilinx primitives (DSP48E1, BRAM18). For iverilog, leave the define
// off and the wrapper falls back to the fft_engine batched implementation.
//
// Three minimal stimuli:
// 1. DC (re=10000, im=0) peak bin = 0 with large magnitude;
// all other bins near zero.
// 2. Impulse (single sample (10000,0)) output magnitude flat across all bins
// (DFT of a delta = constant).
// 3. Tone (cos+jsin at bin K=128) peak bin = K with large magnitude;
// all other bins near zero.
//
// PASS criteria:
// - peak bin matches expected
// - peak magnitude > 8× mean of non-peak bins (analogous to receiver-chain
// SNR check that's been used elsewhere in this codebase)
// ============================================================================
module tb_xfft_2048_xsim;
localparam CLK_PERIOD = 10.0; // 100 MHz
localparam N = 2048;
localparam LOG2N = 11;
reg aclk = 0;
reg aresetn = 0;
reg [7:0] cfg_tdata;
reg cfg_tvalid;
wire cfg_tready;
reg [31:0] din_tdata;
reg din_tvalid;
reg din_tlast;
wire din_tready;
wire [31:0] dout_tdata;
wire [7:0] dout_tuser;
wire dout_tvalid;
wire dout_tlast;
reg dout_tready;
integer pass_count = 0;
integer fail_count = 0;
integer test_num = 0;
integer k;
integer out_idx;
integer peak_bin;
integer peak_mag;
integer mean_others;
integer mag_sum_others;
integer this_mag;
integer cur_re, cur_im;
// Capture the entire output frame
reg signed [15:0] out_re [0:N-1];
reg signed [15:0] out_im [0:N-1];
integer out_collected;
always #(CLK_PERIOD/2) aclk = ~aclk;
xfft_2048 dut (
.aclk (aclk),
.aresetn (aresetn),
.s_axis_config_tdata (cfg_tdata),
.s_axis_config_tvalid (cfg_tvalid),
.s_axis_config_tready (cfg_tready),
.s_axis_data_tdata (din_tdata),
.s_axis_data_tvalid (din_tvalid),
.s_axis_data_tlast (din_tlast),
.s_axis_data_tready (din_tready),
.m_axis_data_tdata (dout_tdata),
.m_axis_data_tuser (dout_tuser),
.m_axis_data_tvalid (dout_tvalid),
.m_axis_data_tlast (dout_tlast),
.m_axis_data_tready (dout_tready)
);
// Continuously capture output frame
always @(posedge aclk) begin
if (aresetn && dout_tvalid && dout_tready && out_collected < N) begin
out_re[out_collected] <= $signed(dout_tdata[15:0]);
out_im[out_collected] <= $signed(dout_tdata[31:16]);
out_collected <= out_collected + 1;
end
end
// ----------------------------------------------------------------
// Send config (FWD = bit 0 = 1)
// ----------------------------------------------------------------
task send_config;
input fwd;
begin
@(posedge aclk);
cfg_tdata <= {7'b0, fwd};
cfg_tvalid <= 1'b1;
@(posedge aclk);
while (!cfg_tready) @(posedge aclk);
@(posedge aclk);
cfg_tvalid <= 1'b0;
end
endtask
// ----------------------------------------------------------------
// Stream N samples; src=0 DC, 1 impulse, 2 tone (bin K=128)
// ----------------------------------------------------------------
task stream_frame;
input integer src;
integer i;
real arg;
integer re16, im16;
begin
out_collected = 0;
@(posedge aclk);
din_tvalid <= 1'b1;
for (i = 0; i < N; i = i + 1) begin
case (src)
0: begin re16 = 10000; im16 = 0; end
1: begin re16 = (i == 0) ? 10000 : 0; im16 = 0; end
2: begin
arg = 6.2831853 * 128.0 * i / N;
re16 = $rtoi(10000.0 * $cos(arg));
im16 = $rtoi(10000.0 * $sin(arg));
end
default: begin re16 = 0; im16 = 0; end
endcase
din_tdata <= {im16[15:0], re16[15:0]};
din_tlast <= (i == N-1);
@(posedge aclk);
while (!din_tready) @(posedge aclk);
end
din_tvalid <= 1'b0;
din_tlast <= 1'b0;
end
endtask
// ----------------------------------------------------------------
// Wait until the full output frame has been captured (out_collected == N)
// or a deadline elapses.
// ----------------------------------------------------------------
task wait_frame;
input integer max_cycles;
integer t;
begin
t = 0;
while (out_collected < N && t < max_cycles) begin
@(posedge aclk);
t = t + 1;
end
if (out_collected < N) begin
$display("[FAIL] Timed out collecting frame: got %0d / %0d after %0d cycles",
out_collected, N, t);
fail_count = fail_count + 1;
end
end
endtask
// ----------------------------------------------------------------
// Locate peak |Re|+|Im| bin in captured frame
// ----------------------------------------------------------------
task analyze_frame;
output integer pk_bin;
output integer pk_mag;
output integer mean_other;
integer i, mag, sum;
begin
pk_bin = 0;
pk_mag = 0;
sum = 0;
for (i = 0; i < N; i = i + 1) begin
mag = (out_re[i] < 0 ? -out_re[i] : out_re[i])
+ (out_im[i] < 0 ? -out_im[i] : out_im[i]);
if (mag > pk_mag) begin
pk_mag = mag;
pk_bin = i;
end
sum = sum + mag;
end
mean_other = (sum - pk_mag) / (N - 1);
end
endtask
task check;
input cond;
input [511:0] label;
begin
test_num = test_num + 1;
if (cond) begin
$display("[PASS] T%0d: %0s", test_num, label);
pass_count = pass_count + 1;
end else begin
$display("[FAIL] T%0d: %0s", test_num, label);
fail_count = fail_count + 1;
end
end
endtask
initial begin
$dumpfile("tb_xfft_2048_xsim.vcd");
$dumpvars(0, tb_xfft_2048_xsim);
cfg_tdata = 0;
cfg_tvalid = 0;
din_tdata = 0;
din_tvalid = 0;
din_tlast = 0;
dout_tready = 1; // Always accept output
out_collected = 0;
repeat (10) @(posedge aclk);
aresetn = 1'b1;
repeat (10) @(posedge aclk);
// ============================================================
// T1: DC stimulus expect peak at bin 0
// ============================================================
$display("\n--- DC stimulus ---");
send_config(1'b1);
stream_frame(0);
wait_frame(20000);
analyze_frame(peak_bin, peak_mag, mean_others);
$display(" peak_bin=%0d peak_mag=%0d mean_others=%0d tuser=0x%h",
peak_bin, peak_mag, mean_others, dout_tuser);
check(peak_bin == 0, "DC -> peak at bin 0");
check(peak_mag > 8 * mean_others + 1, "DC -> peak/mean > 8x");
// ============================================================
// T2: Impulse expect roughly flat magnitude
// ============================================================
$display("\n--- Impulse stimulus ---");
send_config(1'b1);
stream_frame(1);
wait_frame(20000);
analyze_frame(peak_bin, peak_mag, mean_others);
$display(" peak_bin=%0d peak_mag=%0d mean_others=%0d tuser=0x%h",
peak_bin, peak_mag, mean_others, dout_tuser);
// For an impulse at sample 0, |X[k]| is constant; peak/mean ratio
// close to 1. Allow up to 3x to account for bit-width quantization.
check(peak_mag < 3 * mean_others + 100,
"Impulse -> flat spectrum (peak < 3x mean)");
// ============================================================
// T3: Complex tone at bin 128 expect peak at bin 128
// ============================================================
$display("\n--- Tone (bin 128) stimulus ---");
send_config(1'b1);
stream_frame(2);
wait_frame(20000);
analyze_frame(peak_bin, peak_mag, mean_others);
$display(" peak_bin=%0d peak_mag=%0d mean_others=%0d tuser=0x%h",
peak_bin, peak_mag, mean_others, dout_tuser);
check(peak_bin == 128, "Tone -> peak at bin 128");
check(peak_mag > 8 * mean_others + 1, "Tone -> peak/mean > 8x");
$display("");
$display("============================================");
$display(" XFFT_2048 (Xilinx LogiCORE) XSim RESULTS");
$display(" PASSED: %0d / %0d", pass_count, test_num);
$display(" FAILED: %0d / %0d", fail_count, test_num);
if (fail_count == 0)
$display(" ** ALL TESTS PASSED **");
else
$display(" ** %0d TEST(S) FAILED **", fail_count);
$display("============================================");
#100;
$finish;
end
// Global timeout never let the sim run forever
initial begin
#2000000; // 2 ms
$display("[FAIL] Global timeout @ 2 ms");
$finish;
end
endmodule