//
// Copyright 2017 Ettus Research
// Copyright 2018 Ettus Research, a National Instruments Company
//
// SPDX-License-Identifier: LGPL-3.0-or-later
//
// Parameterized FIR filter with AXI stream interface.
// Has several optimizations to resource utilization such as
// using half the number of DSP slices for symmetric coefficients,
// skipping coefficients that are always set to zero, and using
// internal DSP slice registers to hold coefficients.
//
// For the most efficient DSP slice inference use these settings:
// - IN_WIDTH < 25, COEFF_WIDTH < 18, ACCUM_WIDTH < 48
//
// Parameters:
//   IN_WIDTH                 - Input width
//   COEFF_WIDTH              - Coefficient width
//   OUT_WIDTH                - Output width
//   NUM_COEFFS               - Number of coefficients / taps
//   CLIP_BITS                - If IN_WIDTH != OUT_WIDTH, number of MSBs to drop
//   ACCUM_WIDTH              - Accumulator width
//   COEFFS_VEC               - Vector of NUM_COEFFS values each of width COEFF_WIDTH to
//                              initialize coeffs. Defaults to an impulse.
//   RELOADABLE_COEFFS        - Enable (1) or disable (0) reloading coefficients at runtime (via reload bus)
//   BLANK_OUTPUT             - Disable (1) or enable (0) output when initially filling internal pipeline
//   SYMMETRIC_COEFFS         - Reduce multiplier usage by approx half if coefficients are symmetric
//   SKIP_ZERO_COEFFS         - Reduce multiplier usage by assuming zero valued coefficients in
//                              COEFFS_VEC are always zero. Useful for halfband filters.
//   USE_EMBEDDED_REGS_COEFFS - Reduce register usage by only using embedded registers in DSP slices.
//                              Updating taps while streaming will cause temporary output corruption!
//
// Notes:
// - If using USE_EMBEDDED_REGS_COEFFS, coefficients must be written at least once as COEFFS_VEC is ignored!
// - If using SYMMETRIC_COEFFS, only send half the coefficients! e.g. NUM_COEFFS = 11, send the first 6.
//
module axi_fir_filter #(
  parameter IN_WIDTH                 = 16,
  parameter COEFF_WIDTH              = 16,
  parameter OUT_WIDTH                = 16,
  parameter NUM_COEFFS               = 41,
  parameter CLIP_BITS                = $clog2(NUM_COEFFS),
  parameter ACCUM_WIDTH              = IN_WIDTH+COEFF_WIDTH+$clog2(NUM_COEFFS)-1,
  // Default: most significant coefficient slot is the largest positive value,
  // every other slot is zero.
  parameter [NUM_COEFFS*COEFF_WIDTH-1:0] COEFFS_VEC =
    {{1'b0,{(COEFF_WIDTH-1){1'b1}}},{(COEFF_WIDTH*(NUM_COEFFS-1)){1'b0}}},
  parameter RELOADABLE_COEFFS        = 1,
  parameter BLANK_OUTPUT             = 1,
  // Optimizations
  parameter SYMMETRIC_COEFFS         = 1,
  parameter SKIP_ZERO_COEFFS         = 0,
  parameter USE_EMBEDDED_REGS_COEFFS = 1
)(
  // 'reset' and 'clear' are OR'ed together wherever this file uses them.
  input clk,
  input reset,
  input clear,
  // Input sample stream
  input [IN_WIDTH-1:0] s_axis_data_tdata,
  input s_axis_data_tlast,
  input s_axis_data_tvalid,
  output s_axis_data_tready,
  // Filtered output stream (driven by axi_round_and_clip)
  output [OUT_WIDTH-1:0] m_axis_data_tdata,
  output m_axis_data_tlast,
  output m_axis_data_tvalid,
  input m_axis_data_tready,
  // Coefficient reload stream. tready is tied high, so a coefficient is
  // accepted on every cycle in which tvalid is asserted.
  input [COEFF_WIDTH-1:0] s_axis_reload_tdata,
  input s_axis_reload_tvalid,
  input s_axis_reload_tlast,
  output s_axis_reload_tready
);

  // Number of multiply-accumulate slices. With symmetric coefficients each
  // slice serves two taps, so only ceil(NUM_COEFFS/2) slices are needed.
  localparam NUM_SLICES     = SYMMETRIC_COEFFS ?
                                NUM_COEFFS/2 + NUM_COEFFS[0] : // Manual round up, Vivado complains when using $ceil()
                                NUM_COEFFS;
  // 1 when the filter has an odd number of taps
  localparam ODD_LEN        = NUM_COEFFS[0];
  localparam PIPELINE_DELAY = NUM_SLICES+4; // +4 pipeline depth in fir_filter_slice.v

  // Full precision filter output, before rounding / clipping to OUT_WIDTH
  wire [ACCUM_WIDTH-1:0] m_axis_data_tdata_int;
  wire m_axis_data_tvalid_int, m_axis_data_tready_int, m_axis_data_tlast_int;

  ///////////////////////////////////////////////////////
  //
  // Coefficient loading / reloading
  //
  ///////////////////////////////////////////////////////
  // Coefficient storage, only used when the DSP slice embedded registers are
  // NOT used to hold the coefficients.
  // NOTE(review): only the lowest NUM_SLICES entries of COEFFS_VEC are ever
  // copied into coeffs[].
  reg [COEFF_WIDTH-1:0] coeffs[0:NUM_SLICES-1];
  // Strobe telling the filter slices to latch their coefficient input
  reg coeff_load_stb = 1'b1;
  generate
    integer k;
    if (RELOADABLE_COEFFS) begin
      // Use DSP slice registers to hold coefficients. While loading
      // coefficients, input sample data should be throttled if corrupted
      // output samples are unacceptable.
      if (USE_EMBEDDED_REGS_COEFFS) begin
        // Purely combinational, so a blocking assignment is used. Every
        // accepted reload word strobes the slices, shifting the coefficient
        // chain by one position.
        always @(*) begin
          coeff_load_stb = s_axis_reload_tvalid & s_axis_reload_tready;
        end
      // Use shift register to hold coefficients. Coefficients are loaded
      // into fir filter slice on tlast.
      end else begin
        always @(posedge clk) begin
          if (reset | clear) begin
            for (k = 0; k < NUM_SLICES; k = k + 1) begin
              coeffs[k] <= COEFFS_VEC[COEFF_WIDTH*k +: COEFF_WIDTH];
            end
            // Initialize coefficients at reset
            coeff_load_stb <= 1'b1;
          end else begin
            if (s_axis_reload_tvalid & s_axis_reload_tready) begin
              // New coefficients enter at the highest index and shift down
              for (k = NUM_SLICES-1; k > 0; k = k - 1) begin
                coeffs[k-1] <= coeffs[k];
              end
              coeffs[NUM_SLICES-1] <= s_axis_reload_tdata;
            end
            // Strobe the slices one cycle after the last reload word is accepted
            coeff_load_stb <= s_axis_reload_tvalid & s_axis_reload_tready & s_axis_reload_tlast;
          end
        end
      end
    // Coefficients are static
    end else begin
      initial begin
        for (k = 0; k < NUM_SLICES; k = k + 1) begin
          coeffs[k] <= COEFFS_VEC[COEFF_WIDTH*k +: COEFF_WIDTH];
        end
        // Load strobe is held high permanently; it only needs to be set once.
        coeff_load_stb <= 1'b1;
      end
    end
  endgenerate

  assign s_axis_reload_tready = 1'b1;

  ///////////////////////////////////////////////////////
  //
  // Systolic FIR Filter
  //
  ///////////////////////////////////////////////////////
  //
  //  Block Diagram
  //  - Configuration: SYMMETRIC_COEFFS = 1 and USE_EMBEDDED_REGS_COEFFS = 1
  //
  //             +-------+
  //   Sample In | Shift |                                  Sample In delayed NUM_COEFF
  //   +-------->|  Reg  |------------------------------------------------------------->
  //   |         +-------+       |                                |
  //   |                         v                                v
  //   |                      +-----+                          +-----+
  //   |                      |     |                          |     |
  //   |                      +-----+                          +-----+
  //   |                         |                                |
  //   |   +--+   +--+           |  Sample   +--+   +--+          |
  //   |   |  |   |  |           |  Forward  |  |   |  |          |
  //   '-->|  |-->|  |-----------^---------->|  |-->|  |----------^-------------->
  //       |  |   |  |           |    |      |  |   |  |          |    |
  //       +--+   +--+           v    v      +--+   +--+          v    v
  //                         +------------+                   +------------+
  //                         |  Pre-Adder |                   |  Pre-Adder |
  //                         +------------+                   +------------+
  //                               |                                |
  //                               v                                v
  //                            +-----+                          +-----+
  //   *----------------------* |     |                          |     |
  //   | Note: Coeffs are     | +-----+                          +-----+
  //   | loaded backwards     |    |                                |
  //   | for proper alignment |    |         .----------------------^----------------<
  //   *----------------------*    |         |                      |
  //           +--+   +--+         v         |   +--+   +--+        v
  //  Coeff In |  |   |  |   +------------+  |   |  |   |  |  +------------+
  //      .--->|  |-->|  |-->| Multiplier |  '-->|  |-->|  |->| Multiplier |
  //      |    |  |   |  |   +------------+      |  | | |  |  +------------+
  //      |    +--+   +--+         |             +--+ | +--+        |
  //      |                        |                  |             |
  //      '------------------------^------------------'             |
  //                               |             Coeff              |
  //                               v            Forward             v
  //                            +-----+                          +-----+
  //                            |     |                          |     |
  //                            +-----+                          +-----+
  //                               |                                |
  //                               v          +--+  Sample          v          +--+
  //                         +------------+   |  |   Out      +------------+   |  |
  //                         |   Adder    |-->|  |----------->|   Adder    |-->|  |-->
  //                         +------------+   |  |            +------------+   |  |
  //                                          +--+                             +--+
  //
  ///////////////////////////////////////////////////////
  genvar i, l;
  generate
    // Counter to track pipeline fullness. Counts accepted input samples and
    // saturates at PIPELINE_DELAY.
    reg [$clog2(PIPELINE_DELAY):0] cnt;
    always @(posedge clk) begin
      if (reset | clear) begin
        cnt <= 0;
      end else if (s_axis_data_tvalid & s_axis_data_tready) begin
        if (cnt < PIPELINE_DELAY) begin
          cnt <= cnt + 1;
        end
      end
    end

    // Sample delay shift register for efficient implementation
    // when using symmetric coefficients. Note that it is not cleared by
    // reset / clear, only initialized once.
    reg [IN_WIDTH-1:0] sample_shift_reg[0:NUM_COEFFS-1];
    integer n;
    initial begin
      for (n = 0; n < NUM_COEFFS; n = n + 1) begin
        sample_shift_reg[n] <= 0;
      end
    end
    always @(posedge clk) begin
      if (s_axis_data_tvalid & s_axis_data_tready) begin
        for (n = 1; n < NUM_COEFFS; n = n + 1) begin
          sample_shift_reg[n] <= sample_shift_reg[n-1];
        end
        sample_shift_reg[0] <= s_axis_data_tdata;
      end
    end

    // tlast shift register. Delays tlast by PIPELINE_DELAY accepted samples;
    // only used for the output when BLANK_OUTPUT = 1.
    reg [PIPELINE_DELAY-1:0] tlast_shift_reg = 0;
    integer m;
    always @(posedge clk) begin
      if (s_axis_data_tvalid & s_axis_data_tready) begin
        for (m = 1; m < PIPELINE_DELAY; m = m + 1) begin
          tlast_shift_reg[m] <= tlast_shift_reg[m-1];
        end
        tlast_shift_reg[0] <= s_axis_data_tlast;
      end
    end

    // Inter-slice connections: element [i] is the input of slice i and
    // element [i+1] is its output (coefficients travel the opposite way).
    wire [IN_WIDTH-1:0]    sample_in[0:NUM_SLICES];     // Use [0:NUM_SLICES] instead of
    wire [ACCUM_WIDTH-1:0] sample_accum[0:NUM_SLICES];  // [0:NUM_SLICES-1] to make the
    wire [COEFF_WIDTH-1:0] coeff_forward[0:NUM_SLICES]; // generate loop easier to read
    assign sample_in[0]              = s_axis_data_tdata;
    assign sample_accum[0]           = 0;
    assign coeff_forward[NUM_SLICES] = s_axis_reload_tdata;

    // Build up FIR filter with multiply-accumulate slices (fir_filter_slice)
    for (i = 0; i < NUM_SLICES; i = i + 1) begin
      // Map zero'd out coefficients to simple register delays.
      // The tap is replaced by two sample registers, one accumulator register
      // and one coefficient register.
      // NOTE(review): presumably these mirror the per-slice delays inside
      // fir_filter_slice -- confirm against fir_filter_slice.v.
      if ((SKIP_ZERO_COEFFS == 1) && (COEFFS_VEC[COEFF_WIDTH*i +: COEFF_WIDTH] == 0)) begin
        reg [ACCUM_WIDTH-1:0] sample_accum_reg;
        reg [IN_WIDTH-1:0] sample_in_reg[0:1];
        reg [COEFF_WIDTH-1:0] coeff_in_reg;
        always @(posedge clk) begin
          if (reset | clear) begin
            sample_in_reg[0] <= 0;
            sample_in_reg[1] <= 0;
            sample_accum_reg <= 0;
            coeff_in_reg     <= 0;
          end else begin
            if (s_axis_data_tvalid & s_axis_data_tready) begin
              sample_in_reg[0] <= sample_in[i];
              sample_in_reg[1] <= sample_in_reg[0];
              // Accumulator passes through unchanged (tap contributes zero)
              sample_accum_reg <= sample_accum[i];
            end
            // Keep the coefficient reload chain intact across the skipped tap
            if (coeff_load_stb) begin
              coeff_in_reg     <= coeff_forward[i+1];
            end
          end
        end
        assign sample_in[i+1]    = sample_in_reg[1];
        assign sample_accum[i+1] = sample_accum_reg;
        assign coeff_forward[i]  = coeff_in_reg;
      end else begin
        fir_filter_slice #(
          .IN_WIDTH(IN_WIDTH),
          .COEFF_WIDTH(COEFF_WIDTH),
          .ACCUM_WIDTH(ACCUM_WIDTH),
          .OUT_WIDTH(ACCUM_WIDTH))
        fir_filter_slice (
          .clk(clk),
          .reset(reset),
          .clear(clear),
          .sample_in_stb(s_axis_data_tvalid & s_axis_data_tready),
          .sample_in_a(sample_in[i]),
          // sample_in_b is used to implement symmetric coefficients, always 0 if SYMMETRIC_COEFFS = 0.
          // It is also 0 for the last slice of an odd length filter (the unpaired center tap).
          .sample_in_b(((SYMMETRIC_COEFFS == 0) || ((ODD_LEN == 1) && (i == NUM_SLICES-1))) ? {IN_WIDTH{1'b0}} : sample_shift_reg[NUM_COEFFS-1]),
          .sample_forward(sample_in[i+1]),
          // For proper coefficient loading, coeff_forward must be shifted in backwards. coeffs[] is already backwards.
          .coeff_in(((USE_EMBEDDED_REGS_COEFFS == 1) && (RELOADABLE_COEFFS == 1)) ? coeff_forward[i+1] : coeffs[i]),
          .coeff_forward(coeff_forward[i]),
          .coeff_load_stb(coeff_load_stb),
          .sample_accum(sample_accum[i]),
          .sample_out(sample_accum[i+1]));
      end
    end

    // Output stage. With BLANK_OUTPUT = 1, data / valid / last are forced to
    // zero until PIPELINE_DELAY input samples have been accepted, and tlast is
    // taken from the delayed copy. With BLANK_OUTPUT = 0, valid and tlast
    // follow the input stream directly.
    assign m_axis_data_tdata_int  = (BLANK_OUTPUT == 1) & (cnt < PIPELINE_DELAY) ? 0 : sample_accum[NUM_SLICES];
    assign m_axis_data_tvalid_int = (BLANK_OUTPUT == 1) & (cnt < PIPELINE_DELAY) ? 1'b0 : s_axis_data_tvalid;
    assign m_axis_data_tlast_int  = (BLANK_OUTPUT == 1) ? ((cnt < PIPELINE_DELAY) ? 1'b0 : tlast_shift_reg[PIPELINE_DELAY-1]) : s_axis_data_tlast;
    // Back-pressure from the output stage stalls the whole filter pipeline
    assign s_axis_data_tready     = m_axis_data_tready_int;
  endgenerate

  // Reduce the ACCUM_WIDTH wide result to OUT_WIDTH bits.
  // NOTE(review): rounding / clipping behavior is defined in axi_round_and_clip,
  // which is not part of this file.
  axi_round_and_clip #(
    .WIDTH_IN(ACCUM_WIDTH),
    .WIDTH_OUT(OUT_WIDTH),
    .CLIP_BITS(CLIP_BITS))
  inst_axi_round_and_clip (
    .clk(clk),
    .reset(reset | clear),
    .i_tdata(m_axis_data_tdata_int),
    .i_tlast(m_axis_data_tlast_int),
    .i_tvalid(m_axis_data_tvalid_int),
    .i_tready(m_axis_data_tready_int),
    .o_tdata(m_axis_data_tdata),
    .o_tlast(m_axis_data_tlast),
    .o_tvalid(m_axis_data_tvalid),
    .o_tready(m_axis_data_tready));

endmodule