aboutsummaryrefslogtreecommitdiffstats
path: root/fpga/usrp3/lib/rfnoc/multiply.v
blob: ad0353c66b9d3924b5525330a50da8ceaa1c7fa7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
//
// Copyright 2015 Ettus Research
//
// AXI Stream multiplier. Relies on synthesis engine for proper DSP inference.

module multiply #(
  parameter WIDTH_A     = 16,
  parameter WIDTH_B     = 16,
  parameter WIDTH_P     = 32,
  parameter DROP_TOP_P  = 1,  // Default drops extra bit (16-bit signed x 16-bit signed => 31-bits signed)
  parameter LATENCY     = 3,  // multiplier pipeline latency, 0 - 4
  parameter EN_SATURATE = 0,  // Enable saturating output to avoid overflow (adds +1 to latency)
  parameter EN_ROUND    = 0,  // Enable rounding dropped LSBs (adds +1 to latency, total of +2 if used with EN_SATURATE)
  parameter SIGNED      = 1)  // Signed multiply
(
  input clk, input reset,
  input [WIDTH_A-1:0] a_tdata, input a_tlast, input a_tvalid, output a_tready,
  input [WIDTH_B-1:0] b_tdata, input b_tlast, input b_tvalid, output b_tready,
  output [WIDTH_P-1:0] p_tdata, output p_tlast, output p_tvalid, input p_tready
);

  localparam A_LATENCY = (LATENCY == 1) ? 1 :
                         (LATENCY == 2) ? 1 :
                         (LATENCY == 3) ? 2 :
                         (LATENCY == 4) ? 2 : 2;
  localparam B_LATENCY = A_LATENCY;
  localparam P_LATENCY = (LATENCY == 2) ? 1 :
                         (LATENCY == 3) ? 1 :
                         (LATENCY == 4) ? 2 : 2;

  reg [WIDTH_A-1:0]         a_reg[A_LATENCY-1:0];
  reg [WIDTH_B-1:0]         b_reg[B_LATENCY-1:0];
  reg [WIDTH_A+WIDTH_B-1:0] p_reg[P_LATENCY-1:0];

  wire [A_LATENCY-1:0] en_a_reg;
  wire [B_LATENCY-1:0] en_b_reg;
  wire [P_LATENCY-1:0] en_p_reg;
  wire p_int_tlast, p_int_tvalid, p_int_tready;
  axi_pipe_join #(
    .PRE_JOIN_STAGES0(A_LATENCY),
    .PRE_JOIN_STAGES1(B_LATENCY),
    .POST_JOIN_STAGES(P_LATENCY))
  axi_pipe_join (
    .clk(clk), .reset(reset), .clear(1'b0),
    .i0_tlast(a_tlast), .i0_tvalid(a_tvalid), .i0_tready(a_tready),
    .i1_tlast(b_tlast), .i1_tvalid(b_tvalid), .i1_tready(b_tready),
    .o_tlast(p_int_tlast), .o_tvalid(p_int_tvalid), .o_tready(p_int_tready),
    .enables0(en_a_reg), .enables1(en_b_reg), .enables_post(en_p_reg));

  // Multiply
  wire [WIDTH_A+WIDTH_B-1:0] p_mult_signed   = (LATENCY == 0) ? $signed(a_tdata) * $signed(b_tdata) : $signed(a_reg[A_LATENCY-1]) * $signed(b_reg[B_LATENCY-1]);
  wire [WIDTH_A+WIDTH_B-1:0] p_mult_unsigned = (LATENCY == 0) ? a_tdata * b_tdata : a_reg[A_LATENCY-1] * b_reg[B_LATENCY-1];
  wire [WIDTH_A+WIDTH_B-1:0] p_int_tdata     = (LATENCY == 0) ? (SIGNED ? p_mult_signed : p_mult_unsigned) : p_reg[P_LATENCY-1];

  // Register pipeline
  integer i;
  always @(posedge clk) begin
    if (reset) begin
      for (i = 0; i < A_LATENCY; i = i + 1) begin
        a_reg[i] <= 'd0;
      end
      for (i = 0; i < B_LATENCY; i = i + 1) begin
        b_reg[i] <= 'd0;
      end
      for (i = 0; i < P_LATENCY; i = i + 1) begin
        p_reg[i] <= 'd0;
      end
    end else begin
      for (i = 0; i < A_LATENCY; i = i + 1) begin
        if (en_a_reg[i]) begin
          if (i == 0) begin
            a_reg[i] <= $signed(a_tdata);
          end else begin
            a_reg[i] <= a_reg[i-1];
          end
        end
      end
      for (i = 0; i < B_LATENCY; i = i + 1) begin
        if (en_b_reg[i]) begin
          if (i == 0) begin
            b_reg[i] <= $signed(b_tdata);
          end else begin
            b_reg[i] <= b_reg[i-1];
          end
        end
      end
      for (i = 0; i < P_LATENCY; i = i + 1) begin
        if (en_p_reg[i]) begin
          if (i == 0) begin
            p_reg[i] <= SIGNED ? p_mult_signed : p_mult_unsigned;
          end else begin
            p_reg[i] <= p_reg[i-1];
          end
        end
      end
    end
  end

  // Saturate & Round
  // TODO: Might be able to replace axi_round with DSP's built in rounding
  generate
    if ((EN_SATURATE == 1) && (EN_ROUND == 1)) begin
      axi_round_and_clip #(
        .WIDTH_IN(WIDTH_A+WIDTH_B),
        .WIDTH_OUT(WIDTH_P),
        .CLIP_BITS(DROP_TOP_P))
      axi_round_and_clip (
        .clk(clk), .reset(reset),
        .i_tdata(p_int_tdata), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready),
        .o_tdata(p_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready));
    end else if ((EN_SATURATE == 0) && (EN_ROUND == 1)) begin
      axi_round #(
        .WIDTH_IN(WIDTH_A+WIDTH_B-DROP_TOP_P),
        .WIDTH_OUT(WIDTH_P))
      axi_round (
        .clk(clk), .reset(reset),
        .i_tdata(p_int_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:0]), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready),
        .o_tdata(p_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready));
    end else if ((EN_SATURATE == 1) && (EN_ROUND == 0)) begin
      wire [WIDTH_A+WIDTH_B-DROP_TOP_P-1:0] p_clip_tdata;
      axi_clip #(
        .WIDTH_IN(WIDTH_A+WIDTH_B),
        .WIDTH_OUT(WIDTH_A+WIDTH_B-DROP_TOP_P),
        .CLIP_BITS(DROP_TOP_P))
      axi_clip (
        .clk(clk), .reset(reset),
        .i_tdata(p_int_tdata), .i_tlast(p_int_tlast), .i_tvalid(p_int_tvalid), .i_tready(p_int_tready),
        .o_tdata(p_clip_tdata), .o_tlast(p_tlast), .o_tvalid(p_tvalid), .o_tready(p_tready));
      assign p_tdata = p_clip_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:WIDTH_A+WIDTH_B-DROP_TOP_P-WIDTH_P];
    end else begin
      assign p_tdata      = p_int_tdata[WIDTH_A+WIDTH_B-DROP_TOP_P-1:WIDTH_A+WIDTH_B-DROP_TOP_P-WIDTH_P];
      assign p_tlast      = p_int_tlast;
      assign p_tvalid     = p_int_tvalid;
      assign p_int_tready = p_tready;
    end
  endgenerate

endmodule