Files
NawfalMotii79-PLFM_RADAR/9_Firmware/9_2_FPGA/xfft_2048.v
T
Jason 5c8cc8c96a feat(fpga): swap matched-filter chain to Xilinx LogiCORE FFT v9.1 IP
Replaces the in-house iterative fft_engine.v in the matched-filter chain
with the Pipelined Streaming Xilinx FFT IP, closing RX-NEW-3 (FFT chain
~11x too slow vs PRI budget).

Components:
  * ip/xfft_2048_ip/xfft_2048_ip.xci — committed IP definition
    (16-bit fixed point, BFP scaling, convergent rounding, natural order,
    pipelined-streaming, BRAM data/reorder/phase factors). Vivado
    regenerates .dcp / sim-netlist from this on each build.
  * scripts/50t/gen_xfft_2048_ip.tcl — IP-Catalog generation script
  * scripts/50t/run_xfft_xsim.sh — XSim batch runner for tb_xfft_2048_xsim
  * xfft_2048.v — AXI-Stream wrapper. FFT_USE_XILINX_IP define routes to
    real LogiCORE for synth/XSim; falls back to fft_engine batched
    one-shot for iverilog (unit coverage only).
  * fft_engine_axi_bridge.v — exposes legacy fft_engine port surface on
    top of the xfft_2048 AXI wrapper, so the chain swap is a 1-line
    module-name change.
  * matched_filter_processing_chain.v — fft_engine -> fft_engine_axi_bridge
  * scripts/50t/build_50t.tcl — read_ip + generate_target + synth_ip;
    adds FFT_USE_XILINX_IP to verilog defines.
  * tb/tb_xfft_2048_xsim.v — XSim verification (DC, impulse, tone bin 128).
    All 5 assertions PASS on remote with the real IP; tuser=0x0a (BLK_EXP=10)
    confirms BFP scaling working.

Local iverilog regression: 32/34 PASS — identical to baseline. Same two
RX-NEW-3 failures (Receiver Integration, Matched Filter Chain) — these
only resolve in remote XSim with the real IP, since iverilog uses the
fft_engine fallback inside xfft_2048 (~150K cycles/pass, not the
~2200-cycle Pipelined Streaming throughput). MF cosim 4/4 PASS confirms
bridge bit-exact in fallback mode.

Pending: remote XSim of tb_radar_receiver_final to demonstrate Doppler
frames produced within PRI budget; remote synth to confirm DSP/timing
post-IP.
2026-04-23 12:39:33 +05:45

283 lines
10 KiB
Verilog

`timescale 1ns / 1ps
// ============================================================================
// xfft_2048.v — 2048-point FFT wrapper (Xilinx LogiCORE for synth/XSim,
// in-house fft_engine fallback for iverilog)
// ============================================================================
// AXI-Stream port list mirrors Xilinx LogiCORE Fast Fourier Transform v9.1
// (PG109). Two implementation branches selected by `FFT_USE_XILINX_IP`:
//
// `define FFT_USE_XILINX_IP → instantiates xfft_2048_ip (LogiCORE FFT v9.1)
// Pipelined Streaming I/O, BFP scaling, 16-bit.
// Use for: Vivado synth, remote XSim sim.
//
// `undef FFT_USE_XILINX_IP → instantiates fft_engine batched one-shot
// (collect N → compute → drain N).
// Use for: iverilog local sim only.
//
// Throughput on production silicon (Xilinx IP path): ~N + ~150 cycles per
// transform with full overlap → ~6600 cycles for 3 sequential transforms in
// the matched-filter chain, vs the 16700-cycle PRI budget. Closes RX-NEW-3.
//
// Data format: {Q[15:0], I[15:0]} packed 32-bit on s_axis/m_axis_data_tdata.
// Config tdata[0]: 1 = forward FFT, 0 = inverse FFT (matches PG109 convention).
//
// Block-FP scaling (Xilinx path only): per-frame BLK_EXP returned via
// m_axis_data_tuser[7:0] so chain-level normalization can rescale before
// magnitude compute. Sim path always returns tuser = 0 (no BFP).
// ============================================================================
module xfft_2048 (
// Single clock for every register in this wrapper and for the wrapped core.
input wire aclk,
// Active-low reset. In the fallback branch it asynchronously resets the
// wrapper FSM and drives fft_engine's reset_n. The LogiCORE instantiation
// below does not connect it.
// NOTE(review): confirm xfft_2048_ip is generated without an ARESETn pin;
// if it has one, it is currently left undriven.
input wire aresetn,
// Configuration channel (AXI-Stream slave). 8-bit tdata; only bit 0
// (FWD/INV) is decoded by the IP in BFP mode (no scale schedule).
input wire [7:0] s_axis_config_tdata,
input wire s_axis_config_tvalid,
output wire s_axis_config_tready,
// Data input channel (AXI-Stream slave)
// tdata[15:0] is the real part, tdata[31:16] the imaginary part (this is how
// the fallback branch unpacks it). s_axis_data_tlast is forwarded to the
// LogiCORE but is never read by the fallback branch, which frames purely by
// counting N samples.
input wire [31:0] s_axis_data_tdata,
input wire s_axis_data_tvalid,
input wire s_axis_data_tlast,
output wire s_axis_data_tready,
// Data output channel (AXI-Stream master)
// Same packing as the input: {imag[15:0], real[15:0]}.
output wire [31:0] m_axis_data_tdata,
output wire [7:0] m_axis_data_tuser, // BLK_EXP[7:0] (Xilinx path); 0 (sim)
output wire m_axis_data_tvalid,
output wire m_axis_data_tlast,
input wire m_axis_data_tready
);
`ifdef FFT_USE_XILINX_IP
// ============================================================================
// XILINX LOGICORE FFT v9.1 — production / XSim path
// ============================================================================
// Side-channels (status/event) are tied off here; if downstream needs them
// (e.g. for pipeline-stall debug), surface them through this wrapper.
// The status channel is captured on local wires that nothing reads, and its
// tready is held high so the channel is always drained.
wire [7:0] xfft_status_tdata;
wire xfft_status_tvalid;
// Pure pass-through: every config/data AXI-Stream pin of the wrapper maps
// one-to-one onto the same-named IP pin. All event_* outputs are left open.
xfft_2048_ip u_xfft (
.aclk (aclk),
.s_axis_config_tdata (s_axis_config_tdata),
.s_axis_config_tvalid (s_axis_config_tvalid),
.s_axis_config_tready (s_axis_config_tready),
.s_axis_data_tdata (s_axis_data_tdata),
.s_axis_data_tvalid (s_axis_data_tvalid),
.s_axis_data_tready (s_axis_data_tready),
.s_axis_data_tlast (s_axis_data_tlast),
.m_axis_data_tdata (m_axis_data_tdata),
.m_axis_data_tuser (m_axis_data_tuser),
.m_axis_data_tvalid (m_axis_data_tvalid),
.m_axis_data_tready (m_axis_data_tready),
.m_axis_data_tlast (m_axis_data_tlast),
.m_axis_status_tdata (xfft_status_tdata),
.m_axis_status_tvalid (xfft_status_tvalid),
.m_axis_status_tready (1'b1),
.event_frame_started (),
.event_tlast_unexpected (),
.event_tlast_missing (),
.event_status_channel_halt (),
.event_data_in_channel_halt (),
.event_data_out_channel_halt ()
);
`else
// ============================================================================
// FALLBACK — fft_engine batched one-shot (iverilog path only)
// ============================================================================
// Collect N samples → kick fft_engine → drain N samples. Throughput is
// ~N (collect) + ~160 K (compute) + ~N (drain). NOT representative of the
// real LogiCORE — used only for unit-level iverilog regression coverage.
// ============================================================================
// Transform length and its log2. Counters are one bit wider than an address
// (CNT_W = LOG2N + 1) because they must be able to hold the value N itself:
// the FSM compares them against N to detect "all samples handled".
localparam N = 2048;
localparam LOG2N = 11;
localparam CNT_W = LOG2N + 1;
// FSM states:
//   S_IDLE   - wait for a config beat (direction)
//   S_FEED   - accept N input samples into in_buf_*
//   S_RUN    - replay in_buf_* into fft_engine and capture its output
//   S_OUTPUT - stream out_buf_* onto the master data channel
localparam [2:0] S_IDLE = 3'd0,
S_FEED = 3'd1,
S_RUN = 3'd2,
S_OUTPUT = 3'd3;
reg [2:0] state;
// Direction latched from the config beat (1 = inverse); copied into
// fft_inverse when the transform is started.
reg inverse_reg;
// Frame buffers (N x 16-bit each), tagged with ram_style = "block".
// in_buf_* holds the collected input frame, out_buf_* the captured result.
(* ram_style = "block" *) reg signed [15:0] in_buf_re [0:N-1];
(* ram_style = "block" *) reg signed [15:0] in_buf_im [0:N-1];
(* ram_style = "block" *) reg signed [15:0] out_buf_re [0:N-1];
(* ram_style = "block" *) reg signed [15:0] out_buf_im [0:N-1];
// Progress counters, all cleared in S_IDLE:
//   in_count   - input samples accepted from s_axis_data
//   feed_count - samples presented to fft_engine
//   out_total  - result samples captured from fft_engine
//   out_count  - result samples loaded into the output register
reg [CNT_W-1:0] in_count;
reg [CNT_W-1:0] feed_count;
reg [CNT_W-1:0] out_total;
reg [CNT_W-1:0] out_count;
// fft_engine control/data inputs (registered) and outputs.
reg fft_start;
reg fft_inverse;
reg signed [15:0] fft_din_re, fft_din_im;
reg fft_din_valid;
wire signed [15:0] fft_dout_re, fft_dout_im;
wire fft_dout_valid;
// fft_busy is wired up but not used anywhere in this wrapper.
wire fft_busy;
wire fft_done;
// Registered write ports for the two buffers. The FSM sets strobe, address
// and data in one cycle; the memory write itself happens on the next clock
// edge in the separate buffer-write process.
reg in_buf_we;
reg [LOG2N-1:0] in_buf_waddr;
reg signed [15:0] in_buf_wdata_re, in_buf_wdata_im;
reg out_buf_we;
reg [LOG2N-1:0] out_buf_waddr;
reg signed [15:0] out_buf_wdata_re, out_buf_wdata_im;
// Output holding register that drives the master data channel.
reg signed [15:0] out_rd_re, out_rd_im;
reg out_rd_valid;
// In-house FFT core. Parameters fix a 2048-point transform with 16-bit data
// and twiddles loaded from "fft_twiddle_2048.mem".
fft_engine #(
.N(N), .LOG2N(LOG2N), .DATA_W(16), .INTERNAL_W(32),
.TWIDDLE_W(16), .TWIDDLE_FILE("fft_twiddle_2048.mem")
) fft_core (
.clk(aclk), .reset_n(aresetn),
.start(fft_start), .inverse(fft_inverse),
.din_re(fft_din_re), .din_im(fft_din_im), .din_valid(fft_din_valid),
.dout_re(fft_dout_re), .dout_im(fft_dout_im), .dout_valid(fft_dout_valid),
.busy(fft_busy), .done(fft_done)
);
// Config is accepted only while idle; data only while collecting and the
// frame is not yet full. Both conditions mirror the checks the FSM itself
// makes before consuming a beat.
assign s_axis_config_tready = (state == S_IDLE);
assign s_axis_data_tready = (state == S_FEED) && (in_count < N);
assign m_axis_data_tdata = {out_rd_im, out_rd_re};
assign m_axis_data_tuser = 8'h00; // No BFP in fallback path
assign m_axis_data_tvalid = out_rd_valid;
// out_count is incremented when a sample is loaded into the output register,
// so out_count == N while valid means the register holds sample N-1, i.e.
// the last sample of the frame.
assign m_axis_data_tlast = out_rd_valid && (out_count == N);
// Buffer write process: no reset, commits the writes requested by the FSM
// in the previous cycle.
always @(posedge aclk) begin
if (in_buf_we) begin
in_buf_re[in_buf_waddr] <= in_buf_wdata_re;
in_buf_im[in_buf_waddr] <= in_buf_wdata_im;
end
if (out_buf_we) begin
out_buf_re[out_buf_waddr] <= out_buf_wdata_re;
out_buf_im[out_buf_waddr] <= out_buf_wdata_im;
end
end
// Main FSM. Asynchronous active-low reset clears every control register.
always @(posedge aclk or negedge aresetn) begin
if (!aresetn) begin
state <= S_IDLE;
inverse_reg <= 1'b0;
in_count <= 0;
feed_count <= 0;
out_total <= 0;
out_count <= 0;
fft_start <= 1'b0;
fft_inverse <= 1'b0;
fft_din_re <= 0;
fft_din_im <= 0;
fft_din_valid <= 1'b0;
in_buf_we <= 1'b0;
in_buf_waddr <= 0;
in_buf_wdata_re <= 0;
in_buf_wdata_im <= 0;
out_buf_we <= 1'b0;
out_buf_waddr <= 0;
out_buf_wdata_re <= 0;
out_buf_wdata_im <= 0;
out_rd_re <= 0;
out_rd_im <= 0;
out_rd_valid <= 1'b0;
end else begin
// Defaults: these four strobes are single-cycle pulses unless a state
// below re-asserts them in the same cycle (last non-blocking write wins).
fft_start <= 1'b0;
fft_din_valid <= 1'b0;
in_buf_we <= 1'b0;
out_buf_we <= 1'b0;
case (state)
S_IDLE: begin
// Hold all counters at zero and keep the output channel idle while
// waiting for a config beat.
in_count <= 0;
feed_count <= 0;
out_total <= 0;
out_count <= 0;
out_rd_valid <= 1'b0;
if (s_axis_config_tvalid) begin
// Config bit 0 is 1 for forward, so the inverse flag is its complement.
inverse_reg <= ~s_axis_config_tdata[0];
state <= S_FEED;
end
end
S_FEED: begin
if (in_count < N) begin
if (s_axis_data_tvalid) begin
// tready is high under exactly this condition, so this is a completed
// handshake: queue the sample for writing at address in_count.
in_buf_we <= 1'b1;
in_buf_waddr <= in_count[LOG2N-1:0];
in_buf_wdata_re <= s_axis_data_tdata[15:0];
in_buf_wdata_im <= s_axis_data_tdata[31:16];
in_count <= in_count + 1;
end
end else begin
// Frame complete (the write of the last sample commits on this same edge):
// pulse fft_start for one cycle and hand the direction to the core.
fft_start <= 1'b1;
fft_inverse <= inverse_reg;
feed_count <= 0;
out_total <= 0;
state <= S_RUN;
end
end
S_RUN: begin
// Replay the input buffer into fft_engine, one sample per cycle with no
// back-pressure. The first din_valid cycle is the one right after the
// fft_start pulse.
if (feed_count < N) begin
fft_din_re <= in_buf_re[feed_count[LOG2N-1:0]];
fft_din_im <= in_buf_im[feed_count[LOG2N-1:0]];
fft_din_valid <= 1'b1;
feed_count <= feed_count + 1;
end
// Capture up to N result samples in arrival order.
if (fft_dout_valid && out_total < N) begin
out_buf_we <= 1'b1;
out_buf_waddr <= out_total[LOG2N-1:0];
out_buf_wdata_re <= fft_dout_re;
out_buf_wdata_im <= fft_dout_im;
out_total <= out_total + 1;
end
// Leave only when done is observed and all N results have been counted.
// NOTE(review): out_total is the registered count, so it reads N-1 in the
// cycle of the final dout_valid. If fft_engine pulses done for a single
// cycle coincident with that final dout_valid, this condition is never
// met and the FSM stays in S_RUN — confirm fft_engine's done timing.
if (fft_done && out_total >= N) begin
state <= S_OUTPUT;
out_count <= 0;
out_rd_valid <= 1'b0;
end
end
S_OUTPUT: begin
// Advance when the output register is empty or its current sample is
// being accepted this cycle; otherwise hold it for the stalled consumer.
if (m_axis_data_tready || !out_rd_valid) begin
if (out_count < N) begin
out_rd_re <= out_buf_re[out_count[LOG2N-1:0]];
out_rd_im <= out_buf_im[out_count[LOG2N-1:0]];
out_rd_valid <= 1'b1;
out_count <= out_count + 1;
end else begin
// Sample N-1 (the TLAST beat) has just been accepted: frame finished.
out_rd_valid <= 1'b0;
state <= S_IDLE;
end
end
end
// Unused state encodings fall back to idle.
default: state <= S_IDLE;
endcase
end
end
`ifdef SIMULATION
// Simulation-only: zero all four buffers at time 0 (compiled in only when
// SIMULATION is defined).
integer init_k;
initial begin
for (init_k = 0; init_k < N; init_k = init_k + 1) begin
in_buf_re[init_k] = 0;
in_buf_im[init_k] = 0;
out_buf_re[init_k] = 0;
out_buf_im[init_k] = 0;
end
end
`endif
`endif // FFT_USE_XILINX_IP
endmodule