Files
NawfalMotii79-PLFM_RADAR/9_Firmware/9_2_FPGA/fir_lowpass.v
T
Jason 977434a5f6 docs(fpga): correct fir_lowpass.v rate comment + flag rate/coeff mismatch
The header had two claims that "valid samples arrive every ~4 cycles" at
the FIR boundary. That is false in the production wiring: the CIC `_4x`
decimator turns clk_400m into a 100 M-pulse-per-second stream, then
cdc_adc_to_processing crosses that into clk_100m where dst_valid asserts
every cycle in steady state. The 4:1 ratio applies between the two clock
domains, not as further sub-sampling inside clk_100m.

This matters because the 32-tap coefficients were designed for the
25 MSPS rate the wrong comment described, but the FIR is actually being
driven at 100 MSPS. The cutoff sits 4x higher than intended; existing
tests pass because the 36-bit accumulator silently wraps on large
sustained inputs (see RX-NEW-3 in the project ledger).

Comment-only commit. No RTL behaviour change. Any future DSP-saving
rework — symmetric pre-adder, 4:1 fold, Xilinx FIR Compiler — needs a
designer call on whether to redesign coefficients for 100 MSPS, add a
real decimation stage to hit 25 MSPS, or keep the current accidental
behaviour.
2026-04-23 09:26:23 +05:45

329 lines
14 KiB
Verilog
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
`timescale 1ns / 1ps
module fir_lowpass_parallel_enhanced (
input wire clk, // every register in this module updates on the rising edge
input wire reset_n, // active-low SYNCHRONOUS reset (sampled inside posedge-clk blocks)
input wire signed [17:0] data_in, // input sample; shifted into delay_line[0] when data_valid is high
input wire data_valid, // input strobe; enables the delay line/BREG and seeds valid_pipe
output reg signed [17:0] data_out, // filtered sample (accumulator bits [34:17] by default, saturated)
output reg data_out_valid, // data_valid delayed by 8 rising clock edges
output wire fir_ready, // tied to 1'b1; the filter never back-pressures
output wire filter_overflow // combinational flag: accumulator_reg outside [-2^34, 2^34-1] by default
);
// NOTE: the ports above are hard-coded to [17:0]; they do not track DATA_WIDTH.
// NOTE: TAPS is not freely configurable. The adder tree uses literal loop
// bounds (16/8/4/2) and the coefficient table has exactly 32 entries, so the
// module only works as written with TAPS = 32.
parameter TAPS = 32;
parameter COEFF_WIDTH = 18;
parameter DATA_WIDTH = 18;
// Expected to equal DATA_WIDTH + COEFF_WIDTH: products are assigned into the
// level-0 adders without any explicit width adjustment.
parameter ACCUM_WIDTH = 36;
// ============================================================================
// Pipelined FIR filter for 100 MHz timing closure
//
// Problem: The original fully-combinatorial adder tree for 32 multiply products
// created a 31-deep DSP48E1 PCOUT cascade chain taking 56.6ns (WNS = -48.325ns).
//
// Solution: 5-stage pipelined binary adder tree with registered outputs at
// each level, plus BREG (coefficient register) and MREG (multiply output
// register) stages for DSP48E1 absorption. Each stage performs at most one
// pairwise addition (~1.7ns DSP hop), easily fitting in the 10ns clock period.
//
// Pipeline stages (one rising clock edge each):
// Cycle 0: data_valid -> shift delay line + latch coefficients (BREG)
// Cycle 1: Combinatorial multiply latched into mult_reg (MREG)
// Cycle 2: 16 pairwise sums of 32 multiply results (level 0)
// Cycle 3: 8 pairwise sums (level 1)
// Cycle 4: 4 pairwise sums (level 2)
// Cycle 5: 2 pairwise sums (level 3)
// Cycle 6: 1 final sum -> accumulator_reg (level 4)
// Cycle 7: Output saturation/truncation -> data_out, data_out_valid
//
// Total latency: 8 rising clock edges from data_valid to data_out_valid
// (cycles 0-7 above: data_out_valid is a registered copy of valid_pipe[6],
// and valid_pipe[6] is data_valid delayed by 7 edges).
// NOTE(review): this header previously claimed 9 cycles ("+2 for BREG+MREG").
// In this code BREG loads on the same edge as the delay-line shift, so only
// MREG adds a pipeline stage. Confirm any downstream alignment that assumed 9.
// Throughput: 1 sample per cycle (fully pipelined)
//
// Input rate: 100 MSPS at clk_100m (data_valid asserted EVERY cycle in
// steady state). The CIC `_4x` decimator drops 4:1 from clk_400m -> 100 M
// pulses/s, then the cdc_adc_to_processing CDC into clk_100m emits one
// dst_valid per dst_clk cycle (Gray-toggle handshake at matching rate).
// A previous comment here claimed "samples arrive every ~4 cycles"; that
// was wrong -- the 4:1 ratio applies between clk_400m and clk_100m, not as
// further sub-sampling within clk_100m.
//
// Implication: this 32-tap FIR's cutoff was designed for a 25 MSPS rate
// (the post-decimation rate that "every 4 cycles" would imply). Running
// at 100 MSPS shifts the effective cutoff 4x above that. The system's
// existing tests pass because the 36-bit accumulator silently wraps on
// large sustained inputs (giving lowpass-like behaviour by accident) --
// see the build report and the open RX-NEW-3 design question. Any future
// DSP-saving rework of this module needs a designer call on the
// rate-vs-coefficient mismatch.
//
// Wrap bound, from the coefficient table below: the sum of |coeff[k]| is
// 411332, so a worst-case full-scale input (|x| = 2^17) can drive the true
// sum to about 5.4e10, beyond the signed 36-bit limit of 2^35 (about 3.4e10).
// None of the adder-tree levels grows in width, so such sums wrap before the
// output saturation logic ever sees them.
// ============================================================================
// Filter coefficients (symmetric: coeff[k] == coeff[31-k])
reg signed [COEFF_WIDTH-1:0] coeff [0:TAPS-1];
// Parallel delay line: delay_line[0] is the newest sample, [TAPS-1] the oldest
reg signed [DATA_WIDTH-1:0] delay_line [0:TAPS-1];
// Parallel multiply results as seen by the adder tree (alias of mult_reg below)
wire signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_result [0:TAPS-1];
// Pipelined adder tree registers
// Level 0: 16 pairwise sums of 32 products
reg signed [ACCUM_WIDTH-1:0] add_l0 [0:15];
// Level 1: 8 pairwise sums
// USE_DSP="no" forces pure additions to fabric CARRY4 chains, freeing DSP48E1
// slices for the FFT butterfly multipliers that otherwise spill to 18-level
// fabric carry chains causing timing violations on the XC7A50T (120 DSP budget).
// NOTE(review): the per-stage comments further down still cite "DSP48E1
// absorption" as the reason for sync reset on levels 1-4; with USE_DSP="no"
// on these registers that rationale looks stale -- confirm.
(* USE_DSP = "no" *) reg signed [ACCUM_WIDTH-1:0] add_l1 [0:7];
// Level 2: 4 pairwise sums
(* USE_DSP = "no" *) reg signed [ACCUM_WIDTH-1:0] add_l2 [0:3];
// Level 3: 2 pairwise sums
(* USE_DSP = "no" *) reg signed [ACCUM_WIDTH-1:0] add_l3 [0:1];
// Level 4: final sum
(* USE_DSP = "no" *) reg signed [ACCUM_WIDTH-1:0] accumulator_reg;
// Valid pipeline: 9-bit shift register fed by data_valid.
// A set bit means the named register stage was loaded on the previous edge:
// [0]=delay line/BREG, [1]=MREG, [2]=L0, [3]=L1, [4]=L2, [5]=L3,
// [6]=L4/accumulator. Bit [6] enables the output register and drives
// data_out_valid. Bits [7] and [8] are shifted but never read by any logic
// in this module (spare).
reg [8:0] valid_pipe;
// Initialize coefficients.
// 18-bit two's-complement hex literals: values of 18'sh20000 and above are
// negative (e.g. 18'sh3FD87 = -633). The centre taps coeff[15] and coeff[16]
// are 18'sh1FFFF, the largest positive 18-bit value. The second half of the
// table mirrors the first.
initial begin
// Proper low-pass filter coefficients
coeff[ 0] = 18'sh00AD; coeff[ 1] = 18'sh00CE; coeff[ 2] = 18'sh3FD87; coeff[ 3] = 18'sh02A6;
coeff[ 4] = 18'sh00E0; coeff[ 5] = 18'sh3F8C0; coeff[ 6] = 18'sh0A45; coeff[ 7] = 18'sh3FD82;
coeff[ 8] = 18'sh3F0B5; coeff[ 9] = 18'sh1CAD; coeff[10] = 18'sh3EE59; coeff[11] = 18'sh3E821;
coeff[12] = 18'sh4841; coeff[13] = 18'sh3B340; coeff[14] = 18'sh3E299; coeff[15] = 18'sh1FFFF;
coeff[16] = 18'sh1FFFF; coeff[17] = 18'sh3E299; coeff[18] = 18'sh3B340; coeff[19] = 18'sh4841;
coeff[20] = 18'sh3E821; coeff[21] = 18'sh3EE59; coeff[22] = 18'sh1CAD; coeff[23] = 18'sh3F0B5;
coeff[24] = 18'sh3FD82; coeff[25] = 18'sh0A45; coeff[26] = 18'sh3F8C0; coeff[27] = 18'sh00E0;
coeff[28] = 18'sh02A6; coeff[29] = 18'sh3FD87; coeff[30] = 18'sh00CE; coeff[31] = 18'sh00AD;
end
// ============================================================================
// DSP48E1 PIPELINE REGISTERS (BREG + MREG)
// ============================================================================
// Vivado DRC warnings DPIP-1 (input not pipelined) and DPOP-2 (output not
// pipelined) indicate that the DSP48E1 internal BREG and MREG pipeline stages
// are not being used.
//
// Solution: Add explicit registered stages that Vivado can absorb into DSP48E1:
// BREG: coeff_reg[k] -- registered copy of coeff[k], feeds DSP48 B-port
// MREG: mult_reg[k] -- registered multiply output, feeds DSP48 P-port
//
// With these registers, Vivado sets BREG=1 and MREG=1 inside DSP48E1,
// eliminating 68 DPIP-1 + 35 DPOP-2 warnings and improving timing.
//
// Pipeline impact: MREG adds one register stage between the delay line and
// the adder tree. BREG is loaded on the same data_valid edge as the delay
// line, so it does not lengthen the path from data_valid to data_out_valid.
// Transparent relative to downstream processing
// (Doppler/MTI operate on accumulated frames, not per-sample).
// ============================================================================
// Registered coefficients (BREG -- absorbed into DSP48E1 B-port register)
reg signed [COEFF_WIDTH-1:0] coeff_reg [0:TAPS-1];
// Registered multiply outputs (MREG -- absorbed into DSP48E1 M-register)
reg signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_reg [0:TAPS-1];
// Combinatorial multiply (between BREG and MREG)
wire signed [DATA_WIDTH+COEFF_WIDTH-1:0] mult_comb [0:TAPS-1];
// One signed multiplier per tap: both operands are declared signed, so the
// product is a signed (DATA_WIDTH + COEFF_WIDTH)-bit value.
genvar k;
generate
for (k = 0; k < TAPS; k = k + 1) begin : mult_gen
assign mult_comb[k] = delay_line[k] * coeff_reg[k];
end
endgenerate
// mult_result now comes from the registered multiply output (MREG stage)
genvar m;
generate
for (m = 0; m < TAPS; m = m + 1) begin : mult_alias
assign mult_result[m] = mult_reg[m];
end
endgenerate
// Shared loop index for the procedural for-loops in the always blocks below
integer i;
// ============================================================================
// Pipeline Stage 0: Shift delay line on data_valid
// Sync reset: enables DSP48E1 AREG/BREG absorption for delay_line registers
// feeding the multipliers. Async reset (FDCE) prevented Vivado from using
// the DSP48E1 internal A/B pipeline registers -- the source of 2,522 DPIR-1
// methodology warnings in Build 9. Converting to sync reset (FDRE) allows
// Vivado to absorb these into DSP48E1 AREG/BREG, further reducing LUT count.
// The delay line holds its contents on cycles where data_valid is low.
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
for (i = 0; i < TAPS; i = i + 1) begin
delay_line[i] <= 0;
end
end else if (data_valid) begin
for (i = TAPS-1; i > 0; i = i - 1) begin
delay_line[i] <= delay_line[i-1];
end
delay_line[0] <= data_in;
end
end
// ============================================================================
// Pipeline Stage 0b (BREG): Register coefficients
// Runs on data_valid alongside delay_line shift.
// Vivado absorbs into DSP48E1 B-port pipeline register (BREG=1).
// coeff_reg is cleared by reset and stays zero until the first data_valid
// after reset reloads it from coeff[].
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
for (i = 0; i < TAPS; i = i + 1) begin
coeff_reg[i] <= 0;
end
end else if (data_valid) begin
for (i = 0; i < TAPS; i = i + 1) begin
coeff_reg[i] <= coeff[i];
end
end
end
// ============================================================================
// Pipeline Stage 0c (MREG): Register multiply outputs
// Captures combinatorial multiply results one cycle after BREG.
// Vivado absorbs into DSP48E1 M-register (MREG=1).
// Enabled by valid_pipe[0], i.e. the cycle after the delay line was shifted.
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
for (i = 0; i < TAPS; i = i + 1) begin
mult_reg[i] <= 0;
end
end else if (valid_pipe[0]) begin
for (i = 0; i < TAPS; i = i + 1) begin
mult_reg[i] <= mult_comb[i];
end
end
end
// ============================================================================
// Pipeline Stage 1 (Level 0): Register 16 pairwise sums of 32 multiply results
// Each addition is a single 36-bit add -- one DSP48E1 hop (~1.7ns), fits 10ns.
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
// Now uses mult_result (from mult_reg/MREG stage) instead of combinatorial multiply.
// Enabled by valid_pipe[1]; the result width stays at ACCUM_WIDTH (no growth bit).
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
for (i = 0; i < 16; i = i + 1) begin
add_l0[i] <= 0;
end
end else if (valid_pipe[1]) begin
for (i = 0; i < 16; i = i + 1) begin
// mult_result is (DATA_WIDTH + COEFF_WIDTH) = 36 bits = ACCUM_WIDTH,
// so no sign extension is needed. Direct assignment preserves the
// signed multiply result. (Fixes Vivado Synth 8-693 "zero replication
// count" warning from the original {0{sign_bit}} expression.)
add_l0[i] <= mult_result[2*i] + mult_result[2*i+1];
end
end
end
// ============================================================================
// Pipeline Stage 2 (Level 1): 8 pairwise sums of 16 Level-0 results
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
// Enabled by valid_pipe[2].
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
for (i = 0; i < 8; i = i + 1) begin
add_l1[i] <= 0;
end
end else if (valid_pipe[2]) begin
for (i = 0; i < 8; i = i + 1) begin
add_l1[i] <= add_l0[2*i] + add_l0[2*i+1];
end
end
end
// ============================================================================
// Pipeline Stage 3 (Level 2): 4 pairwise sums of 8 Level-1 results
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
// Enabled by valid_pipe[3].
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
for (i = 0; i < 4; i = i + 1) begin
add_l2[i] <= 0;
end
end else if (valid_pipe[3]) begin
for (i = 0; i < 4; i = i + 1) begin
add_l2[i] <= add_l1[2*i] + add_l1[2*i+1];
end
end
end
// ============================================================================
// Pipeline Stage 4 (Level 3): 2 pairwise sums of 4 Level-2 results
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
// Enabled by valid_pipe[4].
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
add_l3[0] <= 0;
add_l3[1] <= 0;
end else if (valid_pipe[4]) begin
add_l3[0] <= add_l2[0] + add_l2[1];
add_l3[1] <= add_l2[2] + add_l2[3];
end
end
// ============================================================================
// Pipeline Stage 5 (Level 4): Final sum of 2 Level-3 results
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
// Enabled by valid_pipe[5]. accumulator_reg is the full 32-tap sum, modulo
// 2^ACCUM_WIDTH (see the wrap note in the module header).
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
accumulator_reg <= 0;
end else if (valid_pipe[5]) begin
accumulator_reg <= add_l3[0] + add_l3[1];
end
end
// ============================================================================
// Pipeline Stage 6: Output saturation/truncation (registered)
// Sync reset enables DSP48E1 absorption (fixes DPOR-1 warnings)
// data_out_valid follows valid_pipe[6] every cycle; data_out only updates
// when valid_pipe[6] is set and otherwise holds its previous value.
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
data_out <= 0;
data_out_valid <= 0;
end else begin
data_out_valid <= valid_pipe[6];
if (valid_pipe[6]) begin
// Output saturation logic: clamp when the accumulator lies outside
// [-2^(ACCUM_WIDTH-2), 2^(ACCUM_WIDTH-2)-1], i.e. when the selected
// output slice below could not represent it.
if (accumulator_reg > (2**(ACCUM_WIDTH-2)-1)) begin
data_out <= (2**(DATA_WIDTH-1))-1;
end else if (accumulator_reg < -(2**(ACCUM_WIDTH-2))) begin
data_out <= -(2**(DATA_WIDTH-1));
end else begin
// Truncate (keep middle bits). With the default parameters this
// selects bits [34:17], i.e. an arithmetic right shift by 17.
// The low 17 bits are simply dropped; no rounding term is added.
data_out <= accumulator_reg[ACCUM_WIDTH-2:DATA_WIDTH-1];
end
end
end
end
// ============================================================================
// Valid pipeline shift register (9 bits; bits [0]..[6] gate the datapath stages)
// Sync reset -- no DSP48 involvement but keeps reset style consistent with datapath
// Shifts every cycle (not gated), so gaps in data_valid propagate as gaps.
// ============================================================================
always @(posedge clk) begin
if (!reset_n) begin
valid_pipe <= 9'b000000000;
end else begin
valid_pipe <= {valid_pipe[7:0], data_valid};
end
end
// Always ready to accept new data (fully pipelined)
assign fir_ready = 1'b1;
// Overflow detection: same two comparisons as the saturation branch above,
// evaluated combinationally on accumulator_reg. It is not qualified by any
// valid bit, so it reflects whatever accumulator_reg currently holds.
assign filter_overflow = (accumulator_reg > (2**(ACCUM_WIDTH-2)-1)) ||
(accumulator_reg < -(2**(ACCUM_WIDTH-2)));
endmodule