乘法器

非常简单。

1
2
3
4
5
6
7
// Naive single-cycle multiplier: the whole 32x32 product is produced by one
// combinational `*` operator.
//   src1, src2 : 32-bit operands
//   out        : 64-bit product
module MUL(
    input  logic [31:0] src1,
    input  logic [31:0] src2,
    output logic [63:0] out
);
    // The 64-bit assignment target sets the expression width, so both
    // operands are widened before the multiply and no product bits are lost.
    assign out = src1 * src2;
endmodule

结束。


我们肯定不能这样子实现乘法器——即使这种写法能被综合为DSP模块。为什么?因为乘法实在是太太太长了,时间太久。如果将这么一大坨逻辑塞在EX级,时钟频率一定会非常难看。

怎么办?我们将乘法拆分开,在每个时钟周期实现一部分。这样子,可以稍微改善一点时序。

四级流水线乘法器 MUL.sv

首先思考一下:多周期的流水线乘法器会带来哪些额外的时序控制与竞争冒险?

首先,模块要给出信号,来表示自己“是否完成当前运算”以及“是否能接受新的运算”。此外,在执行时,需要将乘法指令用到的寄存器与写回的目标寄存器记住:当指令在旁路的乘法流水线中执行、结果尚未算出,而后面一条指令又需要用到该结果时,必须据此阻塞流水线。再如,乘法结果写回时,如果它的目标寄存器与主流水线当前执行完毕的指令的写回寄存器一致,则应当选择程序顺序上最后一条指令的值进行写回。要考虑的还真不少。

乘法指令拆分

对于两个32位的数相乘,我们可以拆成低16位和高16位,再两两相乘,最后将四部分相加。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
// Excerpt of the four combinational stages of the split multiplier.
// NOTE(review): the s1_*/s2_*/s3_* names look like the outputs of the
// preceding pipeline registers; their declarations (widths, signedness) are
// outside this excerpt — confirm before relying on the widths quoted below.

// Stage 1
// Low x low partial product, formed from bits [15:0] of both operands.
pp_ll_comb = src1_signed[15:0] * src2_signed[15:0];

// Stage 2
// The two cross terms. The low halves are prefixed with a 0 bit and cast with
// $signed so they enter the multiply as non-negative values.
always_comb begin
// A_lo * B_hi (16-bit unsigned * 17-bit signed = 33-bit signed)
pp_lh_comb = $signed({1'b0, s1_a_lo}) * s1_b_hi;
// A_hi * B_lo (17-bit signed * 16-bit unsigned = 33-bit signed)
pp_hl_comb = s1_a_hi * $signed({1'b0, s1_b_lo});
end

// Stage 3
// High x high term plus the sum of the two cross terms.
always_comb begin
// A_hi * B_hi (17-bit signed * 17-bit signed = 34-bit signed)
pp_hh_comb = s2_a_hi * s2_b_hi;
// Sum of middle partial products (with sign extension)
pp_mid_comb = $signed(s2_pp_lh) + $signed(s2_pp_hl);
end

// Stage 4
// Final sum: pp_ll at weight 2^0, the middle sum shifted left by 16 and pp_hh
// shifted left by 32. The middle and high terms are widened by replicating
// their bit 33 thirty times before the shift.
product_comb = {32'b0, s3_pp_ll} + ({{30{s3_pp_mid[33]}}, s3_pp_mid} << 16) +
({{30{s3_pp_hh[33]}}, s3_pp_hh} << 32);

流水线传递信号

我们要在内部传递一堆计算值,以及当前乘法计算是否有效的信号。在最后一级时传出,代表准备输出。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
// Stage-1 pipeline register of the 4-stage multiplier.
// An asynchronous active-low reset (rst_n) and a synchronous flush (flush_i)
// both clear every field; otherwise the incoming request is captured.
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
s1_valid <= 1'b0;
s1_op <= 2'b0;
s1_rd <= 5'b0;
s1_canceled <= 1'b0;
s1_a_hi <= 17'b0;
s1_b_hi <= 17'b0;
s1_a_lo <= 16'b0;
s1_b_lo <= 16'b0;
s1_pp_ll <= 32'b0;
end else if (flush_i) begin
// Synchronous flush: same cleared state as reset, applied on a clock edge.
s1_valid <= 1'b0;
s1_op <= 2'b0;
s1_rd <= 5'b0;
s1_canceled <= 1'b0;
s1_a_hi <= 17'b0;
s1_b_hi <= 17'b0;
s1_a_lo <= 16'b0;
s1_b_lo <= 16'b0;
s1_pp_ll <= 32'b0;
end else begin
s1_valid <= mul_valid_i;
s1_op <= mul_op_i;
s1_rd <= mul_rd_i;
// Check if this stage should be canceled (WAW without RAW):
// set when a non-zero cancel_rd_i equals the rd of a valid incoming request.
s1_canceled <= (cancel_rd_i != 5'b0) && (cancel_rd_i == mul_rd_i) && mul_valid_i;
// Store split operands for next stage (bits [32:16] and [15:0])
s1_a_hi <= src1_signed[32:16];
s1_b_hi <= src2_signed[32:16];
s1_a_lo <= src1_signed[15:0];
s1_b_lo <= src2_signed[15:0];
// Low x low partial product computed combinationally in stage 1
s1_pp_ll <= pp_ll_comb;
end
end

// ...

// Stage-4 (final) pipeline register of the 4-stage multiplier.
// An asynchronous active-low reset and a synchronous flush clear the stage;
// otherwise it takes the stage-3 control fields and the summed product.
always_ff @(posedge clk or negedge rst_n) begin
if (!rst_n) begin
s4_valid <= 1'b0;
s4_op <= 2'b0;
s4_rd <= 5'b0;
s4_canceled <= 1'b0;
s4_product <= 64'b0;
end else if (flush_i) begin
s4_valid <= 1'b0;
s4_op <= 2'b0;
s4_rd <= 5'b0;
s4_canceled <= 1'b0;
s4_product <= 64'b0;
end else begin
s4_valid <= s3_valid;
s4_op <= s3_op;
s4_rd <= s3_rd;
// Propagate cancel or detect new cancel for this stage:
// the flag is sticky once set, and is also set when a non-zero
// cancel_rd_i equals the rd of the valid entry leaving stage 3.
s4_canceled <= s3_canceled ||
((cancel_rd_i != 5'b0) && (cancel_rd_i == s3_rd) && s3_valid);
s4_product <= product_comb;
end
end

RegisterF 修改

因为我们额外设置了乘法器的计算结果,它不能和主流水线的计算结果一同写入,因为寄存器堆只有一个写端口。如果区分先后写入的话,则又要阻塞一个周期,而我们设置成流水线级的乘法器为的就是尽可能减少阻塞。因此,修改寄存器堆为双端口写回:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// Synchronous register-file write with two write ports.
// If both ports target the same register in the same cycle, the main-pipeline
// port wins: a multiply is a long-latency instruction, so a short instruction
// that writes back alongside it is later in program order and holds the newer
// value. Register 0 is never written by either port.
always_ff @(posedge clk) begin
    // Multiplier port is scheduled first ...
    if (rf_we2 && (wR2 != 5'd0)) begin
        rf_in[wR2] <= wD2;
    end
    // ... so that the main-pipeline write, being the last non-blocking
    // assignment to the same element, overrides it on a collision.
    if (rf_we && (wR != 5'd0)) begin
        rf_in[wR] <= wD;
    end
end

HazardUnit 修改

新的冲突,新的分析

对于双端口的写回寄存器来说,乘法计算结果与主流水线的计算结果独立,因此在写回时,可能出现写到同一个目标寄存器的现象。参考上面的注释,乘法为长指令,后写回的短指令在时序上更靠后,因此计算结果较新。按照ISA规范,后执行的指令必须最后写入,否则会破坏程序正确性。

首先检测所有可能的 WAW 冲突:若乘法器的某一级中有一条有效的 MUL 指令,ID 级的指令要写寄存器且目标寄存器不为 x0,并且两者的目标寄存器相同,则存在 WAW 冲突。

如果 ID 指令读取了 MUL 的目标寄存器,说明这是个WAW+RAW的冲突,必须为RAW冲突阻塞流水线,因为乘法结果在EX级之后的第四个周期才计算完成:

1
2
3
4
5
6
7
8
9
10
11
12
# Cycle 1
MUL x5, x1, x2 # S1, 将写 x5

# Cycle 2
ADD x6, x5, x7 # ID, 读 x5(RAW依赖)

# Cycle 3
MUL x5, x5, x4 # ID, 读 x5,写 x5(WAW + RAW)
# 检测结果:
# mul_waw_conflict[1] = 1 (与S1的第一条MUL冲突)
# id_reads_mul_rd[1] = 1 (ID读取了x5)
# mul_waw_hazard = 1 → 停顿流水线
Cycle PC IF ID EX MUL S1→S4 操作
1 0x00 MUL1 - - MUL1进入IF
2 0x04 ADD MUL1 检测WAW+RAW
mul_waw_hazard=1
3 MUL2 ADD MUL1 - (Data In) 停顿(保持PC)
4 S1 停顿
5 S2
6 S3
7 S4 (WriteBack) MUL1完成,ADD读到正确x5
8 0x08 MUL2 ADD S1 ADD进行运算,MUL2进入ID

此外,还存在纯WAW冲突,即“两个写回端口相同”,但不存在RAW冲突:

1
2
3
4
5
6
7
8
9
10
# Cycle 1
MUL x5, x1, x2 # S1, 将写 x5

# Cycle 2
ADD x5, x3, x4 # ID, 写 x5,但不读 x5(仅WAW)
# 检测结果:
# mul_waw_conflict[1] = 1 (与S1的MUL冲突)
# id_reads_mul_rd[1] = 0 (ID不读x5)
# pure_waw_conflict = 1
# mul_cancel_rd = x5 → 取消第一条MUL的写回

对于这一种冲突,使用mul_cancel_rd信号来设置取消写回的寄存器地址。

Cycle PC IF ID EX MEM WB MUL S1→S4 操作
1 0x00 MUL - - - - - MUL进入IF
2 0x04 ADD MUL 检测纯WAW
mul_cancel_rd=x5
3 0x08 ... ADD MUL - (Data In) MUL标记为取消
4 0x0C ... ADD MUL S1 (canceled) -
5 0x10 ... ADD MUL S2 (canceled)
6 0x14 ... ADD S3 (canceled) ADD写x5
7 0x18 ... S4 (No WriteBack) MUL不写x5

注意这里的乘法器实际经过的流水线为IF/ID/S1/S2/S3/S4

WAW依赖相关逻辑

修改HazardUnit模块内部,加入传出的乘法器内流水线寄存器暂存的信号:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// Hazard unit excerpt: multiplier-related hazard detection for a 4-stage
// multiplier. Ports and logic elided by the author are marked with `// ..`.
module HazardUnit(
    // ..
    // Multiplier status signals (4-stage pipeline).
    // Convention: mul_stage_busy[0]=S1 ... [3]=S4; mul_rd_s[0]=S1 ... [3]=S4
    input  logic [ 3:0]      mul_stage_busy,   // busy flag of each multiplier stage
    input  logic [ 3:0][4:0] mul_rd_s,         // destination register held in each stage
    input  logic             is_mul_instr_ID,  // instruction in ID is a multiply
    input  logic             is_mul_instr_EX,  // instruction in EX is a multiply
    // ..
    // Write-back cancel request (suppresses a MUL write-back on a pure WAW)
    output logic [ 4:0]      mul_cancel_rd
);


    // ------------------------------------------------------------
    // MUL hazard detection (RAW + structural + WAW handling)
    // 4-stage multiplier: S1->S2->S3->S4 (result available at the end of S4)
    // ------------------------------------------------------------
    logic mul_use_hazard;
    logic mul_struct_hazard;
    logic mul_waw_hazard;
    logic pure_waw_conflict;

    // Structural hazard: a new multiply in ID cannot enter while S1 is busy.
    assign mul_struct_hazard = is_mul_instr_ID && mul_stage_busy[0];

    // Merge EX and S1..S4 into five lanes so one loop can handle them all.
    // mul_rd_all[0]=EX, mul_rd_all[1]=S1 ... mul_rd_all[4]=S4
    logic [4:0][4:0] mul_rd_all;
    logic [4:0]      mul_vld_all;
    assign mul_rd_all  = {mul_rd_s, wR_EX};
    assign mul_vld_all = {mul_stage_busy, is_mul_instr_EX};

    // Per-lane hit vectors (handy for inspecting each lane in a waveform).
    logic [4:0] mul_raw_hit_r1;
    logic [4:0] mul_raw_hit_r2;
    logic [4:0] id_reads_mul_rd;
    logic [4:0] mul_waw_conflict;

    always_comb begin
        mul_raw_hit_r1   = '0;
        mul_raw_hit_r2   = '0;
        id_reads_mul_rd  = '0;
        mul_waw_conflict = '0;

        for (int i = 0; i < 5; i++) begin
            // RAW: ID reads rR1/rR2 and it matches the rd of a valid in-flight MUL.
            mul_raw_hit_r1[i] = mul_vld_all[i] && (mul_rd_all[i] != 5'd0) && rs1_used_ID &&
                                (mul_rd_all[i] == rR1_ID);
            mul_raw_hit_r2[i] = mul_vld_all[i] && (mul_rd_all[i] != 5'd0) && rs2_used_ID &&
                                (mul_rd_all[i] == rR2_ID);

            // Does ID read this lane's rd? Used to tell "WAW that must stall"
            // apart from "WAW that only cancels the write-back". Lane validity
            // is not checked here; it comes in through mul_waw_conflict below.
            id_reads_mul_rd[i] = (mul_rd_all[i] != 5'd0) &&
                                 ((rs1_used_ID && (rR1_ID == mul_rd_all[i])) ||
                                  (rs2_used_ID && (rR2_ID == mul_rd_all[i])));

            // WAW: ID will write wR_ID (not x0) and a valid lane holds the same rd.
            mul_waw_conflict[i] = mul_vld_all[i] && rf_we_ID && (wR_ID != 5'd0) &&
                                  (mul_rd_all[i] == wR_ID);
        end

        mul_use_hazard = (|mul_raw_hit_r1) || (|mul_raw_hit_r2);

        // A WAW lane stalls only when ID also reads that rd; a WAW lane whose
        // rd is not read by ID is a pure WAW.
        mul_waw_hazard    = |(mul_waw_conflict &  id_reads_mul_rd);
        pure_waw_conflict = |(mul_waw_conflict & ~id_reads_mul_rd);
    end

    // Pure WAW (no RAW dependency): ask the multiplier to cancel its
    // write-back to that register; 0 means "no cancel".
    assign mul_cancel_rd = pure_waw_conflict ? wR_ID : 5'd0;

    // ------------------------------------------------------------
    // Pipeline flush and stall
    // ------------------------------------------------------------
    logic any_hazard;
    assign any_hazard = load_use_hazard || mul_use_hazard || mul_struct_hazard || mul_waw_hazard;

    always_comb begin
        keep_pc     = any_hazard;
        stall_IF_ID = any_hazard;
        flush_IF_ID = branch_predicted_result;
        // ID/EX is flushed both on a branch flush and on any hazard stall.
        flush_ID_EX = (branch_predicted_result || any_hazard);
    end

endmodule

写回数据来源选择

还得修改一下位于WB级的写回数据选择MUX:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
// rf_wd_WB_final is the WB-stage value kept for forwarding; it must also
// account for a multiplier result.
logic [31:0] rf_wd_WB_final;
logic mul_result_forwarded;
always_comb begin
    // The multiplier result is selected only when the multiplier is really
    // writing back and its rd matches one of the registers being read.
    mul_result_forwarded = mul_valid_o && mul_rf_we_o && (mul_rd_o == rR1 || mul_rd_o == rR2);

    // Lowest priority: ALU result.
    rf_wd_WB_final = rf_wd_WB_from_ALU;
    // Next: data loaded from memory.
    if (wd_sel_WB == `WD_SEL_FROM_DRAM) begin
        rf_wd_WB_final = load_data_WB;
    end
    // Highest priority: forwarded multiplier result.
    if (mul_result_forwarded) begin
        rf_wd_WB_final = mul_result;
    end
end

之后再进行时序分析,发现关键路径跑到了MUL上,说明乘法真的很长。其实可以再将乘法拆开为六级的。

上面的MUX写法一开始为:

1
2
3
4
if (mul_valid_o && mul_rf_we_o) begin
// 乘法结果用于前递
rf_wd_WB_final = mul_result;
end

但是当乘法器的结果与主流水线的计算结果在同一个周期写回时,前递数据会优先选择乘法器的结果,而不会判断ID级要读取的源寄存器是否与乘法器的目标寄存器一致!

这个问题在四级乘法器时没有被发现,但是到了六级的时候被发现了。

采用Booth编码与华莱士树的六级乘法器

手写是不可能手写的,让AI帮忙吧。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
`include "include/defines.svh"
// 六级流水线乘法器 - Radix-4 Booth编码 + Wallace Tree
// 使用Booth编码减少部分积数量,Wallace Tree并行归约
// 显著降低关键路径延时

module MUL (
input logic clk,
input logic rst_n,

// 来自 ID 阶段的输入
input logic mul_valid_i,
input logic [ 1:0] mul_op_i,
input logic [31:0] mul_src1_i,
input logic [31:0] mul_src2_i,
input logic [ 4:0] mul_rd_i,
// 流水线冲刷
input logic flush_i,
// WAW 冒险取消写回
input logic [ 4:0] cancel_rd_i,
// 输出到 WB 阶段
output logic mul_valid_o,
output logic [31:0] mul_result_o,
output logic [ 4:0] mul_rd_o,
output logic mul_rf_we_o,
// 流水线状态
output logic mul_busy_o,
output logic [ 5:0] mul_stage_busy_o,
output logic [ 5:0][4:0] mul_rd_s_o
);

// 乘法操作类型
localparam MUL_OP_MUL = 2'b00;
localparam MUL_OP_MULH = 2'b01;
localparam MUL_OP_MULHSU = 2'b10;
localparam MUL_OP_MULHU = 2'b11;

// Booth编码常量 (使用 localparam 代替 enum)
localparam [2:0] BOOTH_0 = 3'b000; // 0
localparam [2:0] BOOTH_P1 = 3'b001; // +1
localparam [2:0] BOOTH_P2 = 3'b010; // +2
localparam [2:0] BOOTH_N2 = 3'b011; // -2
localparam [2:0] BOOTH_N1 = 3'b100; // -1

// =====================================================================
// 3: 2 CSA 压缩器宏定义
// =====================================================================
`define CSA_3_2(a, b, c, sum, carry) \
assign sum = (a) ^ (b) ^ (c); \
assign carry = (((a) & (b)) | ((b) & (c)) | ((a) & (c))) << 1;

// ======================== Stage 1: Booth编码 ========================
logic s1_valid;
logic [ 1:0] s1_op;
logic [ 4:0] s1_rd;
logic s1_canceled;
logic signed [33:0] s1_multiplicand;
logic [ 2:0] s1_booth_enc [16:0]; // 使用 logic [2:0] 代替 enum

// 符号扩展逻辑
logic signed [33:0] multiplicand_ext;
logic signed [34:0] multiplier_ext;

// Booth编码组合逻辑
logic [ 2:0] booth_enc_comb [16:0];

// Booth编码函数
// Radix-4 Booth recoding of one overlapping 3-bit window of the multiplier.
//   bits    : the 3-bit window
//   returns : one of the BOOTH_* digit codes (0, +1, +2, -2, -1)
function automatic logic [2:0] booth_encode(input logic [2:0] bits);
    case (bits)
        3'b001, 3'b010: return BOOTH_P1;
        3'b011:         return BOOTH_P2;
        3'b100:         return BOOTH_N2;
        3'b101, 3'b110: return BOOTH_N1;
        // 3'b000, 3'b111 and any window containing X/Z encode as zero
        default:        return BOOTH_0;
    endcase
endfunction

// Operand extension and Booth recoding (combinational input of stage 1).
always_comb begin
    // Extend the multiplicand to 34 bits and the multiplier to 35 bits,
    // sign- or zero-extending each operand according to the operation.
    case (mul_op_i)
        MUL_OP_MULHU: begin
            multiplicand_ext = {2'b0, mul_src1_i};
            multiplier_ext   = {3'b0, mul_src2_i};
        end
        MUL_OP_MULHSU: begin
            multiplicand_ext = {{2{mul_src1_i[31]}}, mul_src1_i};
            multiplier_ext   = {3'b0, mul_src2_i};
        end
        MUL_OP_MUL, MUL_OP_MULH: begin
            multiplicand_ext = {{2{mul_src1_i[31]}}, mul_src1_i};
            multiplier_ext   = {{3{mul_src2_i[31]}}, mul_src2_i};
        end
        default: begin
            multiplicand_ext = 34'b0;
            multiplier_ext   = 35'b0;
        end
    endcase

    // 17 Booth digits. Digit 0 uses an implicit 0 below bit 0; digit k (k >= 1)
    // looks at multiplier bits [2k+1 : 2k-1].
    booth_enc_comb[0] = booth_encode({multiplier_ext[1:0], 1'b0});
    for (int k = 1; k <= 16; k++) begin
        booth_enc_comb[k] = booth_encode(multiplier_ext[2*k+1 -: 3]);
    end
end

// Stage-1 register: control fields, extended multiplicand and Booth digits.
// rst_n is an asynchronous active-low reset; flush_i is a synchronous flush.
// They are kept in separate branches so that flush_i is not folded into the
// asynchronous reset condition (it is absent from the sensitivity list).
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        s1_valid        <= 1'b0;
        s1_op           <= 2'b0;
        s1_rd           <= 5'b0;
        s1_canceled     <= 1'b0;
        s1_multiplicand <= 34'b0;
        for (int k = 0; k < 17; k++) s1_booth_enc[k] <= BOOTH_0;
    end else if (flush_i) begin
        s1_valid        <= 1'b0;
        s1_op           <= 2'b0;
        s1_rd           <= 5'b0;
        s1_canceled     <= 1'b0;
        s1_multiplicand <= 34'b0;
        for (int k = 0; k < 17; k++) s1_booth_enc[k] <= BOOTH_0;
    end else begin
        s1_valid        <= mul_valid_i;
        s1_op           <= mul_op_i;
        s1_rd           <= mul_rd_i;
        // Mark the entry cancelled when a non-zero cancel_rd_i matches the rd
        // of a valid incoming request.
        s1_canceled     <= (cancel_rd_i != 5'b0) && (cancel_rd_i == mul_rd_i) && mul_valid_i;
        s1_multiplicand <= multiplicand_ext;
        for (int k = 0; k < 17; k++) s1_booth_enc[k] <= booth_enc_comb[k];
    end
end

// ======================== Stage 2: 部分积生成 ========================
logic s2_valid;
logic [ 1:0] s2_op;
logic [ 4:0] s2_rd;
logic s2_canceled;
logic signed [67:0] s2_pp [16:0];

// 部分积生成函数
// Build one 68-bit partial product from a Booth digit.
//   enc          : BOOTH_* digit code
//   multiplicand : 34-bit multiplicand, extended here by replicating bit 33
//   shift        : left shift applied to the selected multiple
//   returns      : (0, +/-1x or +/-2x the multiplicand) << shift
function automatic logic signed [67:0] gen_partial_product(
    input logic [2:0] enc, input logic signed [33:0] multiplicand, input int shift);
    logic signed [67:0] times_one;
    logic signed [67:0] times_two;
    logic signed [67:0] result;
    // 68-bit extended 1x and 2x multiples, shared by the +/- cases below
    times_one = {{34{multiplicand[33]}}, multiplicand};
    times_two = {{33{multiplicand[33]}}, multiplicand, 1'b0};
    case (enc)
        BOOTH_P1: result = times_one << shift;
        BOOTH_P2: result = times_two << shift;
        BOOTH_N1: result = (-times_one) << shift;
        BOOTH_N2: result = (-times_two) << shift;
        // BOOTH_0 and any unknown code contribute nothing
        default:  result = 68'sb0;
    endcase
    return result;
endfunction

// 部分积组合逻辑
logic signed [67:0] pp_comb[16:0];

// Generate the 17 partial products; digit k carries weight 4^k, i.e. a left
// shift of 2*k bits.
always_comb begin
    for (int k = 0; k < 17; k++) begin
        pp_comb[k] = gen_partial_product(s1_booth_enc[k], s1_multiplicand, 2 * k);
    end
end

// Stage-2 register: control fields and the 17 partial products.
// The asynchronous reset (rst_n) and the synchronous flush (flush_i) are kept
// in separate branches so flush_i is not part of the asynchronous condition.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        s2_valid    <= 1'b0;
        s2_op       <= 2'b0;
        s2_rd       <= 5'b0;
        s2_canceled <= 1'b0;
        for (int k = 0; k < 17; k++) s2_pp[k] <= 68'b0;
    end else if (flush_i) begin
        s2_valid    <= 1'b0;
        s2_op       <= 2'b0;
        s2_rd       <= 5'b0;
        s2_canceled <= 1'b0;
        for (int k = 0; k < 17; k++) s2_pp[k] <= 68'b0;
    end else begin
        s2_valid    <= s1_valid;
        s2_op       <= s1_op;
        s2_rd       <= s1_rd;
        // Sticky cancel: keep an earlier cancel, or set it when a non-zero
        // cancel_rd_i matches the rd of the valid entry leaving stage 1.
        s2_canceled <= s1_canceled ||
                       ((cancel_rd_i != 5'b0) && (cancel_rd_i == s1_rd) && s1_valid);
        for (int k = 0; k < 17; k++) s2_pp[k] <= pp_comb[k];
    end
end

// ======================== Stage 3: Wallace Tree 第一层 ========================
// 17个部分积 → 12个 (使用5个CSA)
logic s3_valid;
logic [ 1:0] s3_op;
logic [ 4:0] s3_rd;
logic s3_canceled;
logic [67:0] s3_pp [11:0];

// CSA 第一层输出信号
logic [67:0] w3_sum [ 4:0];
logic [67:0] w3_carry [ 4:0];

`CSA_3_2(s2_pp[0], s2_pp[1], s2_pp[2], w3_sum[0], w3_carry[0])
`CSA_3_2(s2_pp[3], s2_pp[4], s2_pp[5], w3_sum[1], w3_carry[1])
`CSA_3_2(s2_pp[6], s2_pp[7], s2_pp[8], w3_sum[2], w3_carry[2])
`CSA_3_2(s2_pp[9], s2_pp[10], s2_pp[11], w3_sum[3], w3_carry[3])
`CSA_3_2(s2_pp[12], s2_pp[13], s2_pp[14], w3_sum[4], w3_carry[4])

// Stage-3 register: control fields and the 12 rows left after the first CSA
// layer. The asynchronous reset (rst_n) and the synchronous flush (flush_i)
// are kept in separate branches so flush_i is not part of the asynchronous
// condition.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        s3_valid    <= 1'b0;
        s3_op       <= 2'b0;
        s3_rd       <= 5'b0;
        s3_canceled <= 1'b0;
        for (int k = 0; k < 12; k++) s3_pp[k] <= 68'b0;
    end else if (flush_i) begin
        s3_valid    <= 1'b0;
        s3_op       <= 2'b0;
        s3_rd       <= 5'b0;
        s3_canceled <= 1'b0;
        for (int k = 0; k < 12; k++) s3_pp[k] <= 68'b0;
    end else begin
        s3_valid    <= s2_valid;
        s3_op       <= s2_op;
        s3_rd       <= s2_rd;
        // Sticky cancel: keep an earlier cancel, or set it when a non-zero
        // cancel_rd_i matches the rd of the valid entry leaving stage 2.
        s3_canceled <= s2_canceled ||
                       ((cancel_rd_i != 5'b0) && (cancel_rd_i == s2_rd) && s2_valid);
        // Rows 0..9: sum/carry pairs of the five compressors, interleaved.
        for (int j = 0; j < 5; j++) begin
            s3_pp[2*j]   <= w3_sum[j];
            s3_pp[2*j+1] <= w3_carry[j];
        end
        // Rows 10..11: the two partial products that bypassed this layer.
        s3_pp[10] <= s2_pp[15];
        s3_pp[11] <= s2_pp[16];
    end
end

// ======================== Stage 4: Wallace Tree 第二层 ========================
// 12 → 8 → 6
logic s4_valid;
logic [ 1:0] s4_op;
logic [ 4:0] s4_rd;
logic s4_canceled;
logic [67:0] s4_pp [5:0];

// 第一轮: 12 → 8
logic [67:0] w4a_sum [3:0];
logic [67:0] w4a_carry [3:0];

`CSA_3_2(s3_pp[0], s3_pp[1], s3_pp[2], w4a_sum[0], w4a_carry[0])
`CSA_3_2(s3_pp[3], s3_pp[4], s3_pp[5], w4a_sum[1], w4a_carry[1])
`CSA_3_2(s3_pp[6], s3_pp[7], s3_pp[8], w4a_sum[2], w4a_carry[2])
`CSA_3_2(s3_pp[9], s3_pp[10], s3_pp[11], w4a_sum[3], w4a_carry[3])

// 第二轮: 8 → 6
logic [67:0] w4b_sum [1:0];
logic [67:0] w4b_carry[1:0];

`CSA_3_2(w4a_sum[0], w4a_carry[0], w4a_sum[1], w4b_sum[0], w4b_carry[0])
`CSA_3_2(w4a_carry[1], w4a_sum[2], w4a_carry[2], w4b_sum[1], w4b_carry[1])

// Stage-4 register: control fields and the 6 rows left after the second
// Wallace layer. The asynchronous reset (rst_n) and the synchronous flush
// (flush_i) are kept in separate branches so flush_i is not part of the
// asynchronous condition.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        s4_valid    <= 1'b0;
        s4_op       <= 2'b0;
        s4_rd       <= 5'b0;
        s4_canceled <= 1'b0;
        for (int k = 0; k < 6; k++) s4_pp[k] <= 68'b0;
    end else if (flush_i) begin
        s4_valid    <= 1'b0;
        s4_op       <= 2'b0;
        s4_rd       <= 5'b0;
        s4_canceled <= 1'b0;
        for (int k = 0; k < 6; k++) s4_pp[k] <= 68'b0;
    end else begin
        s4_valid    <= s3_valid;
        s4_op       <= s3_op;
        s4_rd       <= s3_rd;
        // Sticky cancel: keep an earlier cancel, or set it when a non-zero
        // cancel_rd_i matches the rd of the valid entry leaving stage 3.
        s4_canceled <= s3_canceled ||
                       ((cancel_rd_i != 5'b0) && (cancel_rd_i == s3_rd) && s3_valid);
        // Outputs of the two second-round compressors ...
        s4_pp[0] <= w4b_sum[0];
        s4_pp[1] <= w4b_carry[0];
        s4_pp[2] <= w4b_sum[1];
        s4_pp[3] <= w4b_carry[1];
        // ... plus the first-round pair that skipped the second round.
        s4_pp[4] <= w4a_sum[3];
        s4_pp[5] <= w4a_carry[3];
    end
end

// ======================== Stage 5: Wallace Tree 第三层 ========================
// 6 → 4 → 3 → 2
logic s5_valid;
logic [ 1:0] s5_op;
logic [ 4:0] s5_rd;
logic s5_canceled;
logic [67:0] s5_sum;
logic [67:0] s5_carry;

// 6 → 4
logic [67:0] w5a_sum [1:0];
logic [67:0] w5a_carry [1:0];

`CSA_3_2(s4_pp[0], s4_pp[1], s4_pp[2], w5a_sum[0], w5a_carry[0])
`CSA_3_2(s4_pp[3], s4_pp[4], s4_pp[5], w5a_sum[1], w5a_carry[1])

// 4 → 3
logic [67:0] w5b_sum;
logic [67:0] w5b_carry;

`CSA_3_2(w5a_sum[0], w5a_carry[0], w5a_sum[1], w5b_sum, w5b_carry)

// 3 → 2
logic [67:0] w5c_sum;
logic [67:0] w5c_carry;

`CSA_3_2(w5b_sum, w5b_carry, w5a_carry[1], w5c_sum, w5c_carry)

// Stage-5 register: control fields and the final carry-save pair.
// The asynchronous reset (rst_n) and the synchronous flush (flush_i) are kept
// in separate branches so flush_i is not part of the asynchronous condition.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        s5_valid    <= 1'b0;
        s5_op       <= 2'b0;
        s5_rd       <= 5'b0;
        s5_canceled <= 1'b0;
        s5_sum      <= 68'b0;
        s5_carry    <= 68'b0;
    end else if (flush_i) begin
        s5_valid    <= 1'b0;
        s5_op       <= 2'b0;
        s5_rd       <= 5'b0;
        s5_canceled <= 1'b0;
        s5_sum      <= 68'b0;
        s5_carry    <= 68'b0;
    end else begin
        s5_valid    <= s4_valid;
        s5_op       <= s4_op;
        s5_rd       <= s4_rd;
        // Sticky cancel: keep an earlier cancel, or set it when a non-zero
        // cancel_rd_i matches the rd of the valid entry leaving stage 4.
        s5_canceled <= s4_canceled ||
                       ((cancel_rd_i != 5'b0) && (cancel_rd_i == s4_rd) && s4_valid);
        s5_sum      <= w5c_sum;
        s5_carry    <= w5c_carry;
    end
end

// ======================== Stage 6: 最终CPA加法 ========================
logic s6_valid;
logic [ 1:0] s6_op;
logic [ 4:0] s6_rd;
logic s6_canceled;
logic [63:0] s6_product;

// 最终64位加法
logic [63:0] final_product;
assign final_product = s5_sum[63:0] + s5_carry[63:0];

// Stage-6 register: control fields and the 64-bit product from the final adder.
// The asynchronous reset (rst_n) and the synchronous flush (flush_i) are kept
// in separate branches so flush_i is not part of the asynchronous condition.
always_ff @(posedge clk or negedge rst_n) begin
    if (!rst_n) begin
        s6_valid    <= 1'b0;
        s6_op       <= 2'b0;
        s6_rd       <= 5'b0;
        s6_canceled <= 1'b0;
        s6_product  <= 64'b0;
    end else if (flush_i) begin
        s6_valid    <= 1'b0;
        s6_op       <= 2'b0;
        s6_rd       <= 5'b0;
        s6_canceled <= 1'b0;
        s6_product  <= 64'b0;
    end else begin
        s6_valid    <= s5_valid;
        s6_op       <= s5_op;
        s6_rd       <= s5_rd;
        // Sticky cancel: keep an earlier cancel, or set it when a non-zero
        // cancel_rd_i matches the rd of the valid entry leaving stage 5.
        s6_canceled <= s5_canceled ||
                       ((cancel_rd_i != 5'b0) && (cancel_rd_i == s5_rd) && s5_valid);
        s6_product  <= final_product;
    end
end

// ======================== 输出逻辑 ========================
// Result select: MUL returns the low word of the product, the three MULH*
// variants return the high word.
always_comb begin
    case (s6_op)
        MUL_OP_MULH, MUL_OP_MULHSU, MUL_OP_MULHU: mul_result_o = s6_product[63:32];
        MUL_OP_MUL:                               mul_result_o = s6_product[31:0];
        default:                                  mul_result_o = 32'b0;
    endcase
end

assign mul_rd_o    = s6_rd;
assign mul_valid_o = s6_valid && !s6_canceled;

// A cancel request that arrives in the very cycle the result leaves stage 6
// still suppresses the register write; writes to register 0 are never enabled.
logic s6_cancel_current_cycle;
assign s6_cancel_current_cycle = (cancel_rd_i != 5'b0) && (cancel_rd_i == s6_rd);
assign mul_rf_we_o = s6_valid && !s6_canceled && !s6_cancel_current_cycle && (s6_rd != 5'b0);

// Pipeline status: bit 0 / element 0 is stage 1, bit 5 / element 5 is stage 6.
assign mul_stage_busy_o = {s6_valid, s5_valid, s4_valid, s3_valid, s2_valid, s1_valid};
assign mul_busy_o       = |mul_stage_busy_o;

assign mul_rd_s_o = {s6_rd, s5_rd, s4_rd, s3_rd, s2_rd, s1_rd};

// 取消宏定义
`undef CSA_3_2

endmodule

然后同步修改CPU_TOP.sv

1
2
3
4
localparam int unsigned MUL_STAGE = 6;  // 乘法器流水线级数
logic [MUL_STAGE-1:0] mul_stage_busy;
logic [MUL_STAGE-1:0][4:0] mul_rd_s;
logic [4:0] mul_cancel_rd;

再改一下HazardUnit,改成参数化:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// Parameterised hazard unit (sketch): the multiplier depth is a parameter and
// every multiplier-related vector is sized from it. Elided ports and logic are
// marked with `// ...`.
module HazardUnit #(
    parameter integer MUL_STAGE = 4  // number of multiplier pipeline stages
) (
    input logic [MUL_STAGE-1:0]      mul_stage_busy,  // busy flag of each multiplier stage
    input logic [MUL_STAGE-1:0][4:0] mul_rd_s,        // destination register held in each stage
    // ...
);
    // Lane 0 is EX; lanes 1..MUL_STAGE are the multiplier stages.
    logic [MUL_STAGE:0][4:0] mul_rd_all;
    logic [MUL_STAGE:0]      mul_vld_all;
    assign mul_rd_all  = {mul_rd_s, wR_EX};
    assign mul_vld_all = {mul_stage_busy, is_mul_instr_EX};

    // Per-lane hit vectors (handy for inspecting each lane in a waveform).
    logic [MUL_STAGE:0] mul_raw_hit_r1;
    logic [MUL_STAGE:0] mul_raw_hit_r2;
    logic [MUL_STAGE:0] id_reads_mul_rd;
    logic [MUL_STAGE:0] mul_waw_conflict;

    // A procedural for-loop with an `int` index must live inside a procedural
    // block; it is not legal directly at module scope.
    always_comb begin
        // ...
        for (int i = 0; i < MUL_STAGE + 1; i++) begin
            // ...
        end
        // ...
    end
endmodule