Files
learnFPGAProject/RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IMF.v
2025-11-27 04:28:54 +03:00

1163 lines
44 KiB
Verilog

/******************************************************************************/
// Electron: valid. fmax: 70 MHz exp. fmax: 80 MHz
// TestDrive: morphing tachyon into a RV32IMF core, trying to
// preserve maxfreq at each step.
// Step 0: Tachyon valid. fmax: 115-120 MHz exp. fmax: 135-140 MHz
// Step 1: Barrel shft valid. fmax: 110-115 MHz exp. fmax: 130-135 MHz
// Step 2: RV32M valid. fmax: 105-115 MHz exp. fmax: 120 MHz
// Step 3: RV32F decod only valid. fmax: 100-105 MHz exp. fmax: 105 MHz
//
/******************************************************************************/
// Firmware generation flags for this processor.
// These macros are plain strings and are not referenced by the logic visible
// in this part of the file; presumably the firmware build reads them to
// select the toolchain target and optimization level -- TODO confirm.
`define NRV_ARCH "rv32imaf"
`define NRV_ABI "ilp32f"
// Alternative integer-only settings, kept for reference:
//`define NRV_ARCH "rv32im"
//`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-O3"
// Check condition and display message in simulation (when BENCH is defined).
//   ASSERT(cond,msg)        : displays msg when cond is false
//   ASSERT_NOT_REACHED(msg) : displays msg unconditionally
// msg is pasted right after $display, so it must carry its own parentheses,
// for instance ASSERT(x == 0, ("x = %d", x));
// ASSERT expands to an if without an else: do not place it directly before
// an else that belongs to an enclosing if.
// Without BENCH, both macros expand to nothing.
`ifdef BENCH
`define ASSERT(cond,msg) if(!(cond)) $display msg
`define ASSERT_NOT_REACHED(msg) $display msg
`else
`define ASSERT(cond,msg)
`define ASSERT_NOT_REACHED(msg)
`endif
// FPU Normalization needs to detect the position of the first bit set
// in the A_frac register. It is easier to count the number of leading
// zeroes (CLZ for Count Leading Zeroes), as follows. See:
// https://electronics.stackexchange.com/questions/196914/verilog-synthesize-high-speed-leading-zero-count
// Count Leading Zeroes, built as a recursive binary tree.
//   in : W_IN-bit word (W_IN must be a power of two, >= 2)
//   out: number of zero bits above the most significant set bit.
//        An all-zero input yields W_IN-1 (every level reports "empty").
// At each level, the MSB of the count tells whether the upper half is empty;
// the remaining bits are the count in whichever half holds the first set bit.
module CLZ #(
   parameter W_IN  = 64,            // must be power of 2, >= 2
   parameter W_OUT = $clog2(W_IN)
) (
   input  wire [W_IN-1:0]  in,
   output wire [W_OUT-1:0] out
);
   generate
      if(W_IN != 2) begin
         // Recursive case: split the word and recurse into one half.
         wire [W_IN/2-1:0] upper = in[W_IN-1 : W_IN/2];
         wire [W_IN/2-1:0] lower = in[W_IN/2-1 : 0];
         wire upper_is_zero = (upper == {(W_IN/2){1'b0}});
         wire [W_IN/2-1:0] selected = upper_is_zero ? lower : upper;
         wire [W_OUT-2:0] sub_count;
         CLZ #(
            .W_IN(W_IN/2)
         ) inner(
            .in(selected),
            .out(sub_count)
         );
         assign out = {upper_is_zero, sub_count};
      end else begin
         // Terminal case: two bits, one leading zero iff the top bit is 0.
         assign out = ~in[1];
      end
   endgenerate
endmodule
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input reset // set to 0 to reset the processor
);
// NOTE(review): presumably the PC value loaded on reset; the reset logic is
// not in this part of the file -- confirm.
parameter RESET_ADDR = 32'h00000000;
// Width of the internal address registers (PC, PCplusImm, loadstore_addr).
parameter ADDR_WIDTH = 24;
// (32-ADDR_WIDTH) zero bits, prepended to an ADDR_WIDTH-bit address to
// obtain a 32-bit value (mem_addr, and PC-derived write-back data).
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // zero padding up to 32 bits
// Bit-reverse a 32-bit word: bit i of the result is bit (31-i) of x.
// Used by the shifter, so that one right shifter serves both left and
// right shifts.
function [31:0] flip32;
   input [31:0] x;
   integer bit_idx;
   begin
      for(bit_idx = 0; bit_idx < 32; bit_idx = bit_idx + 1) begin
         flip32[bit_idx] = x[31 - bit_idx];
      end
   end
endfunction
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// funct3 in 1-hot form: funct3Is[v] is set iff funct3 == v
// (decoding it once into 1-hot form reduces LUT count).
(* onehot *) reg [7:0] funct3Is;

// Instruction class flags. Base RISC-V (RV32I) has only 10 different
// instruction classes; isFPU is added for the F extension.
reg isLoad, isStore;
reg isALUimm, isALUreg;
reg isLUI, isAUIPC;
reg isBranch, isJAL, isJALR;
reg isSYSTEM;
reg isFPU;

// The five immediate formats, sign-extended to 32 bits.
reg [31:0] Uimm, Iimm, Simm, Bimm, Jimm;

// Asserted if dest. register is non-zero (writeback)
reg rdIsNZ;

// Everything is latched from mem_rdata while waiting for the instruction.
always @(posedge clk) begin
   if(state[WAIT_INSTR_bit]) begin
      // --- immediates ---
      Uimm <= {mem_rdata[31:12], 12'b0};
      Iimm <= {{20{mem_rdata[31]}}, mem_rdata[31:20]};
      Simm <= {{20{mem_rdata[31]}}, mem_rdata[31:25], mem_rdata[11:7]};
      Bimm <= {{19{mem_rdata[31]}}, mem_rdata[31], mem_rdata[7],
               mem_rdata[30:25], mem_rdata[11:8], 1'b0};
      Jimm <= {{11{mem_rdata[31]}}, mem_rdata[31], mem_rdata[19:12],
               mem_rdata[20], mem_rdata[30:21], 1'b0};

      // --- funct3 and destination register ---
      funct3Is <= 8'd1 << mem_rdata[14:12];
      rdIsNZ   <= (mem_rdata[11:7] != 5'b00000);

      // --- instruction class, from opcode bits [6:2] ---
      // Load and Store ignore bit 2, so that they also match FLW and FSW.
      isLoad   <= (mem_rdata[6:3] == 4'b0000);  // rd <- mem[rs1+Iimm]
      isStore  <= (mem_rdata[6:3] == 4'b0100);  // mem[rs1+Simm] <- rs2
      isALUimm <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
      isALUreg <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
      isAUIPC  <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
      isLUI    <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
      isBranch <= (mem_rdata[6:2] == 5'b11000); // if(rs1OPrs2) PC<-PC+Bimm
      isJALR   <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
      isJAL    <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
      isSYSTEM <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles
      isFPU    <= (mem_rdata[6:5] == 2'b10);    // all FPU except FLW/FSW
   end
end

// Any integer ALU instruction (register or immediate form).
wire isALU = isALUreg | isALUimm;
/***************************************************************************/
// The register file.
/***************************************************************************/
// Source operands, read from the register file while waiting for the
// instruction. The FPU micro-instructions also overwrite them with
// intermediate values (FDIV, FSQRT).
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] rs3; // this one is used by the FMA instructions.
// Single 64-entry register file; the MSB of the index selects the bank.
reg [31:0] registerFile [0:63]; // 0..31: integer registers
// 32..63: floating-point registers
/***************************************************************************/
// The FPU
/***************************************************************************/
// instruction decoder
// One flag per FPU instruction, latched from mem_rdata while
// state[WAIT_INSTR_bit] is set. The tests are deliberately partial:
//  - isFMADD..isFNMADD only look at opcode bits [4:2]
//  - the others look at opcode bit 4 and funct7[6:2] (mem_rdata[31:27]),
//    refined where needed by funct3 bits (mem_rdata[13:12]) or by bit 0 of
//    the rs2 field (mem_rdata[20])
// The execution logic only consults them when isFPU is set.
reg isFMADD, isFMSUB, isFNMSUB, isFNMADD, isFADD, isFSUB, isFMUL, isFDIV,
isFSQRT, isFSGNJ, isFSGNJN, isFSGNJX, isFMIN, isFMAX, isFEQ, isFLT,
isFLE, isFCLASS, isFCVTWS, isFCVTWUS, isFCVTSW, isFCVTSWU, isFMVXW,
isFMVWX;
reg rdIsFP; // Asserted if destination register is a FP register.
// rs1 is a FP register if instr[6:5] = 2'b10 except for:
// FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[31:28] = 4'b1101
// FMV.W.X : instr[6:2] = 5'b10100 and instr[31:28] = 4'b1111
// Combinatorial (computed directly from mem_rdata) because it selects the
// register bank in the same cycle as the register file is read.
// NOTE(review): the original comment mentioned a second version of this
// signal for compressed instructions; none is visible in this part of the
// file.
wire rs1IsFP = (mem_rdata[6:5] == 2'b10 ) &&
!((mem_rdata[4:2] == 3'b100) && (
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
)
);
// rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
// (instr[6:2] = 5'b01001). Combinatorial, for the same reason as rs1IsFP.
wire rs2IsFP = (mem_rdata[6:5] == 2'b10) || (mem_rdata[6:2]==5'b01001);
always @(posedge clk) begin
if(state[WAIT_INSTR_bit]) begin
// Fused multiply-add family: opcode[4:2] selects the variant.
isFMADD <= (mem_rdata[4:2] == 3'b000);
isFMSUB <= (mem_rdata[4:2] == 3'b001);
isFNMSUB <= (mem_rdata[4:2] == 3'b010);
isFNMADD <= (mem_rdata[4:2] == 3'b011);
// Other FPU instructions: opcode bit 4 set, selected by funct7[6:2].
isFADD <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00000));
isFSUB <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00001));
isFMUL <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00010));
isFDIV <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00011));
isFSQRT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b01011));
isFSGNJ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b00));
isFSGNJN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b01));
isFSGNJX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b10));
isFMIN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && !mem_rdata[12]);
isFMAX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && mem_rdata[12]);
isFEQ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b10));
isFLT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b01));
isFLE <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b00));
// FCLASS and FMV.X.W share funct7 and differ by funct3 bit 0.
isFCLASS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && mem_rdata[12]);
// Conversions: bit 0 of the rs2 field selects the unsigned variant.
isFCVTWS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && !mem_rdata[20]);
isFCVTWUS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && mem_rdata[20]);
isFCVTSW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && !mem_rdata[20]);
isFCVTSWU <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && mem_rdata[20]);
isFMVXW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && !mem_rdata[12]);
isFMVWX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11110));
// Destination bank. Among R-type FPU instructions, those with bit 31
// set write an integer register, except FCVT.S.W{U} and FMV.W.X.
rdIsFP <= (mem_rdata[6:2] == 5'b00001) || // FLW
(mem_rdata[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB
(mem_rdata[6:4] == 3'b101 && (
(mem_rdata[31] == 1'b0) || // R-Type FPU
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
)
);
end
end
// FPU output: register A repacked in single-precision layout, that is,
// sign, low 8 bits of the exponent, and bits 46..24 of the fraction
// (bit 47, the leading one of a normalized A, is implicit and dropped).
// FPU_OUT is a macro so that it can also be used as an assignment target;
// assigning a 32-bit integer result through it stores the raw bits in
// A_sign, A_exp[7:0] and A_frac[46:24].
`define FPU_OUT {A_sign, A_exp[7:0], A_frac[46:24]}
wire [31:0] fpuOut = `FPU_OUT;
// Two temporary 32-bit registers used by FDIV and FSQRT
reg [31:0] tmp1;
reg [31:0] tmp2;
// Expand the source registers into sign, exponent and fraction.
// Normalized, first bit set is bit 23 (additional, implicit bit), or zero.
// For now, flush all denormals to zero: the fraction is forced to zero
// whenever the exponent field is zero.
// TODO: denormals and infinities
// Following IEEE754, represented number is +/- frac * 2^(exp-127-23)
// (127: bias 23: position of first bit set for normalized numbers)
wire rs1_sign = rs1[31];
wire [7:0] rs1_exp = rs1[30:23];
wire [23:0] rs1_frac = rs1_exp == 8'd0 ? 24'b0 : {1'b1, rs1[22:0]};
wire rs2_sign = rs2[31];
wire [7:0] rs2_exp = rs2[30:23];
wire [23:0] rs2_frac = rs2_exp == 8'd0 ? 24'b0 : {1'b1, rs2[22:0]};
wire rs3_sign = rs3[31];
wire [7:0] rs3_exp = rs3[30:23];
wire [23:0] rs3_frac = rs3_exp == 8'd0 ? 24'b0 : {1'b1, rs3[22:0]};
// Two high-resolution registers
// Register A has the accumulator / shifters / leading zero counter
// Normalized if first bit set is bit 47
// Represented number is +/- frac * 2^(exp-127-47)
// The exponents are 9 bits wide and signed, the fractions 50 bits wide,
// which leaves two bits above the normalized position (bit 47).
reg A_sign;
reg signed [8:0] A_exp;
reg signed [49:0] A_frac;
reg B_sign;
reg signed [8:0] B_exp;
reg signed [49:0] B_frac;
// ******************* Comparisons ******************************************
// All comparisons are between registers A and B. Magnitudes are compared
// first on the exponents, then (for equal exponents) on the fractions.
// exp_diff[8] and frac_diff[50] are the sign bits of (B - A): set when
// the A component is the larger one.
// NOTE(review): +0 and -0 are not treated as equal here (A_EQ_B requires
// identical signs, and A_LT_B is true for A = -0, B = +0) -- confirm this
// is acceptable.
// Exponent adder
wire signed [8:0] exp_sum = B_exp + A_exp;
wire signed [8:0] exp_diff = B_exp - A_exp;
wire expA_EQ_expB = (exp_diff == 0);
wire fracA_EQ_fracB = (frac_diff == 0);
wire fabsA_EQ_fabsB = (expA_EQ_expB && fracA_EQ_fracB);
// |A| < |B|: B has the larger exponent, or same exponent and larger fraction
wire fabsA_LT_fabsB = (!exp_diff[8] && !expA_EQ_expB) ||
(expA_EQ_expB && !fracA_EQ_fracB && !frac_diff[50]);
wire fabsA_LE_fabsB = (!exp_diff[8] && !expA_EQ_expB) ||
(expA_EQ_expB && !frac_diff[50]);
// |B| < |A|: A has the larger exponent, or same exponent and larger fraction
wire fabsB_LT_fabsA = exp_diff[8] || (expA_EQ_expB && frac_diff[50]);
wire fabsB_LE_fabsA = exp_diff[8] ||
(expA_EQ_expB && (frac_diff[50] || fracA_EQ_fracB));
// Signed comparisons: a negative number is below a positive one; between
// two negative numbers the order of the magnitudes is reversed.
wire A_LT_B = A_sign && !B_sign ||
A_sign && B_sign && fabsB_LT_fabsA ||
!A_sign && !B_sign && fabsA_LT_fabsB ;
wire A_LE_B = A_sign && !B_sign ||
A_sign && B_sign && fabsB_LE_fabsA ||
!A_sign && !B_sign && fabsA_LE_fabsB ;
wire A_EQ_B = fabsA_EQ_fabsB && (A_sign == B_sign);
// ****************** Addition, subtraction *********************************
// One bit wider than the fractions; only bits [49:0] are written back to
// A_frac by the ADD_ADD micro-instruction.
wire signed [50:0] frac_sum = B_frac + A_frac;
wire signed [50:0] frac_diff = B_frac - A_frac;
// ****************** Product ***********************************************
// Product of the two 24-bit expanded fractions of rs1 and rs2.
wire [49:0] prod_frac = rs1_frac * rs2_frac; // TODO: check overflows
// exponent of product, once normalized
// (obtained by writing expression of product and inspecting exponent)
// Two cases: first bit set = 47 or 46 (only possible cases with normals)
// NOTE(review): the result is truncated to 9 signed bits, so a large
// exponent sum wraps to a negative value and is then treated as an
// underflow by prod_Z -- confirm (see the overflow TODO above).
wire signed [8:0] prod_exp_norm = rs1_exp+rs2_exp-127+{7'b0,prod_frac[47]};
// detect null product and underflows (all denormals are flushed to zero)
wire prod_Z = (prod_exp_norm <= 0) || !(|prod_frac[47:46]);
// ****************** Normalization *****************************************
// Count leading zeroes in A
// Note1: CLZ only works with power of two width (hence 14'b0).
// Note2: first bit set = 63 - CLZ (of course !)
wire [5:0] A_clz;
CLZ clz({14'b0,A_frac}, A_clz);
// Exponent of A once normalized = A_exp + first_bit_set - 47
// = A_exp + 63 - clz - 47 = A_exp + 16 - clz
wire signed [8:0] A_exp_norm = A_exp + 16 - {3'b000,A_clz};
// ****************** Reciprocal (1/x), used by FDIV ************************
// Exponent for reciprocal (1/x)
// Initial value of x kept in tmp2.
wire signed [8:0] frcp_exp = 9'd126 + A_exp - $signed({1'b0, tmp2[30:23]});
// ****************** Reciprocal square root (1/sqrt(x)) ********************
// https://en.wikipedia.org/wiki/Fast_inverse_square_root
// Initial estimate: magic constant minus rs1 shifted right by one
// (the sign bit of rs1 is dropped).
wire [31:0] rsqrt_doom_magic = 32'h5f3759df - {1'b0,rs1[30:1]};
// ****************** Float to Integer conversion ***************************
// -127-23 is standard exponent bias
// -6 because it is bit 29 of rs1 that corresponds to bit 47 of A_frac,
// instead of bit 23 (and 23-29 = -6).
wire signed [8:0] fcvt_ftoi_shift = rs1_exp - 9'd127 - 9'd23 - 9'd6;
wire signed [8:0] neg_fcvt_ftoi_shift = -fcvt_ftoi_shift;
// Shift the 32 upper bits of A_frac: to the right when the shift amount is
// negative (result is 0 if the amount is 32 or more), to the left
// otherwise. For left shifts only the 5 low bits of the amount are used,
// larger amounts are not detected (see the overflow TODO in FP_TO_INT).
wire [31:0] A_fcvt_ftoi_shifted = fcvt_ftoi_shift[8] ? // R or L shift
(|neg_fcvt_ftoi_shift[8:5] ? 0 : // underflow
({A_frac[49:18]} >> neg_fcvt_ftoi_shift[4:0])) :
({A_frac[49:18]} << fcvt_ftoi_shift[4:0]);
// ******************* Classification ***************************************
// Result of FCLASS.S: a 10-bit one-hot mask (bits 9..0) that describes
// rs1, zero-extended to 32 bits.
//
// The mantissa tests use the raw bits rs1[22:0] rather than rs1_frac.
// rs1_frac has its implicit bit 23 set as soon as the exponent is non-zero
// (it is never zero when the exponent is 255, so infinities were never
// reported), and it is forced to zero when the exponent is zero (so
// subnormals were reported as zeroes).
wire rs1_exp_Z      = (rs1_exp  == 0  );
wire rs1_exp_255    = (rs1_exp  == 255);
// Expanded (denormal-flushed) fraction is zero. No longer used by fclass,
// kept because the rest of the file may refer to it.
wire rs1_frac_Z     = (rs1_frac == 0  );
// Raw 23-bit mantissa field is zero.
wire rs1_mantissa_Z = (rs1[22:0] == 23'b0);
wire [31:0] fclass = {
   22'b0,
   rs1_exp_255 &  rs1[22],                        // 9: quiet NaN
   rs1_exp_255 & !rs1[22] & (|rs1[21:0]),         // 8: sig   NaN
   !rs1_sign & rs1_exp_255 &  rs1_mantissa_Z,     // 7: +infinity
   !rs1_sign & !rs1_exp_Z  & !rs1_exp_255,        // 6: +normal
   !rs1_sign & rs1_exp_Z   & !rs1_mantissa_Z,     // 5: +subnormal
   !rs1_sign & rs1_exp_Z   &  rs1_mantissa_Z,     // 4: +0
    rs1_sign & rs1_exp_Z   &  rs1_mantissa_Z,     // 3: -0
    rs1_sign & rs1_exp_Z   & !rs1_mantissa_Z,     // 2: -subnormal
    rs1_sign & !rs1_exp_Z  & !rs1_exp_255,        // 1: -normal
    rs1_sign & rs1_exp_255 &  rs1_mantissa_Z      // 0: -infinity
};
/** FPU micro-instructions *************************************************/
// Multi-cycle FPU instructions are executed as sequences of the
// micro-instructions below, read from a small ROM addressed by fpmi_PC.
localparam FPMI_READY = 0;
localparam FPMI_LOAD_AB = 1; // A <- fprs1; B <- fprs2
localparam FPMI_LOAD_AB_MUL = 2; // A <- norm(fprs1*fprs2); B <- fprs3
localparam FPMI_NORM = 3; // A <- norm(A)
localparam FPMI_ADD_SWAP = 4; // if |A| > |B| swap(A,B)
localparam FPMI_ADD_SHIFT = 5; // shift A to match B exponent
localparam FPMI_ADD_ADD = 6; // A <- A + B (or A - B if FSUB)
localparam FPMI_CMP = 7; // fpuOut <- test A,B (FEQ,FLE,FLT)
localparam FPMI_MV_RS1_A = 8; // fprs1 <- A
localparam FPMI_MV_RS2_TMP1 = 9; // fprs2 <- tmp1
localparam FPMI_MV_RS2_MHTMP1 = 10; // fprs2 <- -0.5*|tmp1|
localparam FPMI_MV_RS2_TMP2 = 11; // fprs2 <- tmp2
localparam FPMI_MV_TMP2_A = 12; // tmp2 <- A
localparam FPMI_FRCP_PROLOG = 13; // init reciprocal (1/x)
localparam FPMI_FRCP_ITER = 14; // iteration for reciprocal
localparam FPMI_FRCP_EPILOG = 15; // epilog for reciprocal
localparam FPMI_FRSQRT_PROLOG = 16; // init recipr sqr root (1/sqrt(x))
localparam FPMI_FP_TO_INT = 17; // fpuOut <- fpoint_to_int(fprs1)
localparam FPMI_INT_TO_FP = 18; // A <- int_to_fpoint(rs1)
localparam FPMI_MIN_MAX = 19; // fpuOut <- min/max(A,B)
localparam FPMI_NB = 20;
// Instruction exit flag (if set in current micro-instr, exit microprogram)
// It is the most significant bit of the micro-instruction word
// (bit 6 with FPMI_NB = 20); OR it with a micro-instruction code to mark
// the last step of a micro-program.
localparam FPMI_EXIT_FLAG_bit = 1+$clog2(FPMI_NB);
localparam FPMI_EXIT_FLAG = 1 << FPMI_EXIT_FLAG_bit;
reg [6:0] fpmi_PC; // current micro-instruction pointer
reg [1+$clog2(FPMI_NB):0] fpmi_instr; // current micro-instruction
// current micro-instruction as 1-hot: fpmi_instr == NNN <=> fpmi_is[NNN]
// (the exit flag is not part of the decoded field)
(* onehot *)
wire [FPMI_NB-1:0] fpmi_is = 1 << fpmi_instr[$clog2(FPMI_NB):0];
initial fpmi_PC = 0;
// The FPU is busy whenever the current micro-instruction is not READY,
// that is, whenever fpmi_PC points inside a micro-program.
wire fpuBusy = !fpmi_is[FPMI_READY];
// micro-program ROM (wired
// as a combinatorial function).
// Maps the micro-program counter to a micro-instruction. Address 0 holds
// READY (FPU idle). Each micro-program is a run of consecutive addresses
// whose last entry carries FPMI_EXIT_FLAG, which sends fpmi_PC back to 0.
// The start addresses must match the FPMPROG_xxx constants.
always @(*) begin
case(fpmi_PC)
0: fpmi_instr = FPMI_READY;
// FLT, FLE, FEQ
1: fpmi_instr = FPMI_LOAD_AB;
2: fpmi_instr = FPMI_CMP |
FPMI_EXIT_FLAG;
// FADD, FSUB
3: fpmi_instr = FPMI_LOAD_AB; // A <- fprs1, B <- fprs2
4: fpmi_instr = FPMI_ADD_SWAP; // if(|A| > |B|) swap(A,B)
5: fpmi_instr = FPMI_ADD_SHIFT; // shift A according to B exp
6: fpmi_instr = FPMI_ADD_ADD; // A <- A + B ( or A - B if FSUB)
7: fpmi_instr = FPMI_NORM | // A <- normalize(A)
FPMI_EXIT_FLAG;
// FMUL
8: fpmi_instr = FPMI_LOAD_AB_MUL | // A <- normalize(fprs1*fprs2)
FPMI_EXIT_FLAG;
// FMADD, FMSUB, FNMADD, FNMSUB
9: fpmi_instr = FPMI_LOAD_AB_MUL; // A <- norm(fprs1*fprs2), B <- fprs3
10: fpmi_instr = FPMI_ADD_SWAP; // if(|A| > |B|) swap(A,B)
11: fpmi_instr = FPMI_ADD_SHIFT; // shift A according to B exp
12: fpmi_instr = FPMI_ADD_ADD; // A <- A + B ( or A - B if FSUB)
13: fpmi_instr = FPMI_NORM | // A <- normalize(A)
FPMI_EXIT_FLAG;
// FDIV
// using Newton-Raphson:
// https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
// STEP 1 : D' <- fprs2 normalized between [0.5,1] (set exp to 126)
// A <- -D'*32/17 + 48/17
// STEP 2,3: A <- A * (-A*D+2) (two iterations)
// STEP 4 : A <- fprs1 * A
14: fpmi_instr = FPMI_FRCP_PROLOG; // STEP 1: A <- -D'*32/17 + 48/17
15: fpmi_instr = FPMI_LOAD_AB_MUL; // ---
16: fpmi_instr = FPMI_ADD_SWAP; // |
17: fpmi_instr = FPMI_ADD_SHIFT; // FMADD
18: fpmi_instr = FPMI_ADD_ADD; // |
19: fpmi_instr = FPMI_NORM; // ---
20: fpmi_instr = FPMI_FRCP_ITER; // STEP 2: A <- A * (-A*D + 2)
21: fpmi_instr = FPMI_LOAD_AB_MUL; // ---
22: fpmi_instr = FPMI_ADD_SWAP; // |
23: fpmi_instr = FPMI_ADD_SHIFT; // FMADD
24: fpmi_instr = FPMI_ADD_ADD; // |
25: fpmi_instr = FPMI_NORM; // ---
26: fpmi_instr = FPMI_MV_RS1_A; //
27: fpmi_instr = FPMI_LOAD_AB_MUL; // FMUL
28: fpmi_instr = FPMI_FRCP_ITER; // STEP 3: A <- A * (-A*D + 2)
29: fpmi_instr = FPMI_LOAD_AB_MUL; // ---
30: fpmi_instr = FPMI_ADD_SWAP; // |
31: fpmi_instr = FPMI_ADD_SHIFT; // FMADD
32: fpmi_instr = FPMI_ADD_ADD; // |
33: fpmi_instr = FPMI_NORM; // ---
34: fpmi_instr = FPMI_MV_RS1_A; //
35: fpmi_instr = FPMI_LOAD_AB_MUL; // FMUL
36: fpmi_instr = FPMI_FRCP_EPILOG; // STEP 4: A <- fprs1 * fprs2^(-1)
37: fpmi_instr = FPMI_LOAD_AB_MUL | // FMUL
FPMI_EXIT_FLAG;
// FCVT.W.S, FCVT.WU.S
38: fpmi_instr = FPMI_LOAD_AB;
39: fpmi_instr = FPMI_FP_TO_INT |
FPMI_EXIT_FLAG;
// FCVT.S.W, FCVT.S.WU
40: fpmi_instr = FPMI_INT_TO_FP;
41: fpmi_instr = FPMI_NORM |
FPMI_EXIT_FLAG;
// FSQRT
// Using Doom's fast inverse square root algorithm:
// https://en.wikipedia.org/wiki/Fast_inverse_square_root
// STEP 1 : A <- doom_magic - (A >> 1)
// STEP 2,3: A <- A * (3/2 - (fprs1/2 * A * A))
42: fpmi_instr = FPMI_FRSQRT_PROLOG;
43: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL
44: fpmi_instr = FPMI_MV_RS1_A;
45: fpmi_instr = FPMI_MV_RS2_MHTMP1;
46: fpmi_instr = FPMI_LOAD_AB_MUL; // ---
47: fpmi_instr = FPMI_ADD_SWAP; // |
48: fpmi_instr = FPMI_ADD_SHIFT; // FMADD
49: fpmi_instr = FPMI_ADD_ADD; // |
50: fpmi_instr = FPMI_NORM; // ---
51: fpmi_instr = FPMI_MV_RS1_A;
52: fpmi_instr = FPMI_MV_RS2_TMP2;
53: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL
54: fpmi_instr = FPMI_MV_TMP2_A;
55: fpmi_instr = FPMI_MV_RS1_A;
56: fpmi_instr = FPMI_MV_RS2_TMP2;
57: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL
58: fpmi_instr = FPMI_MV_RS1_A;
59: fpmi_instr = FPMI_MV_RS2_MHTMP1;
60: fpmi_instr = FPMI_LOAD_AB_MUL; // ---
61: fpmi_instr = FPMI_ADD_SWAP; // |
62: fpmi_instr = FPMI_ADD_SHIFT; // FMADD
63: fpmi_instr = FPMI_ADD_ADD; // |
64: fpmi_instr = FPMI_NORM; // ---
65: fpmi_instr = FPMI_MV_RS1_A;
66: fpmi_instr = FPMI_MV_RS2_TMP2;
67: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL
68: fpmi_instr = FPMI_MV_RS1_A;
69: fpmi_instr = FPMI_MV_RS2_TMP1;
70: fpmi_instr = FPMI_LOAD_AB_MUL | // -- FMUL
FPMI_EXIT_FLAG;
// FMIN, FMAX
71: fpmi_instr = FPMI_LOAD_AB;
72: fpmi_instr = FPMI_MIN_MAX |
FPMI_EXIT_FLAG ;
// Addresses 73..127 are not part of any micro-program.
default: begin
`ASSERT_NOT_REACHED(("Invalid microcode address: %d",fpmi_PC));
fpmi_instr = 7'bXXXXXXX;
end
endcase
end
// micro-programs
// Entry point (ROM address of the first micro-instruction) of each
// micro-program. The execution logic loads one of them into fpmi_PC to
// start the corresponding multi-cycle instruction. These values must be
// kept in sync with the addresses used in the micro-program ROM.
localparam FPMPROG_CMP = 1;
localparam FPMPROG_ADD = 3;
localparam FPMPROG_MUL = 8;
localparam FPMPROG_MADD = 9;
localparam FPMPROG_DIV = 14;
localparam FPMPROG_TO_INT = 38;
localparam FPMPROG_INT_TO_FP = 40;
localparam FPMPROG_SQRT = 42;
localparam FPMPROG_MIN_MAX = 71;
// Register fetch, FPU execution and register write-back.
// Exactly one of four actions is taken per clock, by decreasing priority:
//   1. state[WAIT_INSTR_bit]      : read rs1, rs2, rs3 from the register file
//   2. state[EXECUTE2_bit] & isFPU: execute a single-cycle FPU instruction,
//                                   or start a micro-program
//   3. fpuBusy                    : execute the current micro-instruction
//                                   and advance fpmi_PC
//   4. otherwise                  : write writeBackData to the destination
//                                   register, if the instruction has one
// Because write-back is the last branch, it cannot happen while the FPU is
// busy or in the cycle where an FPU instruction is dispatched.
always @(posedge clk) begin
if(state[WAIT_INSTR_bit]) begin
// Fetch registers as soon as instruction is ready.
// The index MSB selects the floating-point bank; rs3 (instr[31:27]) is
// always read from the floating-point bank.
rs1 <= registerFile[{rs1IsFP,mem_rdata[19:15]}];
rs2 <= registerFile[{rs2IsFP,mem_rdata[24:20]}];
rs3 <= registerFile[{1'b1, mem_rdata[31:27]}];
end else if(state[EXECUTE2_bit] & isFPU) begin
// Execute single-cycle instructions and call micro-program
// for micro-programmed ones.
(* parallel_case *)
case(1'b1)
// Single-cycle instructions
isFSGNJ : `FPU_OUT <= { rs2[31], rs1[30:0]};
isFSGNJN : `FPU_OUT <= { !rs2[31], rs1[30:0]};
isFSGNJX : `FPU_OUT <= { rs1[31]^rs2[31], rs1[30:0]};
isFCLASS : `FPU_OUT <= fclass;
isFMVXW | isFMVWX : `FPU_OUT <= rs1;
// Micro-programmed instructions
isFLT | isFLE | isFEQ : fpmi_PC <= FPMPROG_CMP;
isFADD | isFSUB : fpmi_PC <= FPMPROG_ADD;
isFMUL : fpmi_PC <= FPMPROG_MUL;
isFMADD | isFMSUB | isFNMADD | isFNMSUB : fpmi_PC <= FPMPROG_MADD;
isFDIV : fpmi_PC <= FPMPROG_DIV;
isFSQRT : fpmi_PC <= FPMPROG_SQRT;
isFCVTWS | isFCVTWUS : fpmi_PC <= FPMPROG_TO_INT;
isFCVTSW | isFCVTSWU : fpmi_PC <= FPMPROG_INT_TO_FP;
isFMIN | isFMAX : fpmi_PC <= FPMPROG_MIN_MAX;
endcase
// Alternative path that would compute every result on the simulation
// host. Guarded by VERILATORXXX; NOTE(review): that macro is not defined
// anywhere in the visible part of the file, so this looks disabled on
// purpose -- confirm.
`ifdef VERILATORXXX
(* parallel_case *)
case(1'b1)
isFMADD : `FPU_OUT <= $c32("FMADD(",rs1,",",rs2,",",rs3,")");
isFMSUB : `FPU_OUT <= $c32("FMSUB(",rs1,",",rs2,",",rs3,")");
isFNMSUB : `FPU_OUT <= $c32("FNMSUB(",rs1,",",rs2,",",rs3,")");
isFNMADD : `FPU_OUT <= $c32("FNMADD(",rs1,",",rs2,",",rs3,")");
isFMUL : `FPU_OUT <= $c32("FMUL(",rs1,",",rs2,")");
isFADD : `FPU_OUT <= $c32("FADD(",rs1,",",rs2,")");
isFSUB : `FPU_OUT <= $c32("FSUB(",rs1,",",rs2,")");
isFDIV : `FPU_OUT <= $c32("FDIV(",rs1,",",rs2,")");
isFSQRT : `FPU_OUT <= $c32("FSQRT(",rs1,")");
isFSGNJ : `FPU_OUT <= $c32("FSGNJ(",rs1,",",rs2,")");
isFSGNJN : `FPU_OUT <= $c32("FSGNJN(",rs1,",",rs2,")");
isFSGNJX : `FPU_OUT <= $c32("FSGNJX(",rs1,",",rs2,")");
isFMIN : `FPU_OUT <= $c32("FMIN(",rs1,",",rs2,")");
isFMAX : `FPU_OUT <= $c32("FMAX(",rs1,",",rs2,")");
isFEQ : `FPU_OUT <= $c32("FEQ(",rs1,",",rs2,")");
isFLE : `FPU_OUT <= $c32("FLE(",rs1,",",rs2,")");
isFLT : `FPU_OUT <= $c32("FLT(",rs1,",",rs2,")");
isFCLASS : `FPU_OUT <= $c32("FCLASS(",rs1,")") ;
isFCVTWS : `FPU_OUT <= $c32("FCVTWS(",rs1,")");
isFCVTWUS: `FPU_OUT <= $c32("FCVTWUS(",rs1,")");
isFCVTSW : `FPU_OUT <= $c32("FCVTSW(",rs1,")");
isFCVTSWU: `FPU_OUT <= $c32("FCVTSWU(",rs1,")");
isFMVXW: `FPU_OUT <= rs1;
isFMVWX: `FPU_OUT <= rs1;
endcase
`endif
end else if(fpuBusy) begin
// Increment micro-program counter.
// (or return to address 0, READY, after the last micro-instruction)
fpmi_PC <= fpmi_instr[FPMI_EXIT_FLAG_bit] ? 0 : fpmi_PC+1;
// Implementation of the micro-instructions
(* parallel_case *)
case(1'b1)
// A <- rs1 ; B <- rs2
// The 24-bit fractions are placed in bits 47..24, so that the leading
// one of a normal number lands on bit 47. FSUB is an addition with
// the sign of B flipped.
fpmi_is[FPMI_LOAD_AB]: begin
A_sign <= rs1_sign;
A_frac <= {2'b0, rs1_frac, 24'd0};
A_exp <= {1'b0, rs1_exp};
B_sign <= rs2_sign ^ isFSUB;
B_frac <= {2'b0, rs2_frac, 24'd0};
B_exp <= {1'b0, rs2_exp};
end
// A <- (+/-) normalize(rs1*rs2); B <- (+/-)rs3
// The product is negated for FNMSUB/FNMADD, the addend for
// FMSUB/FNMADD. The product is shifted left by one when its first
// bit set is bit 46, and replaced by zero when prod_Z is set.
fpmi_is[FPMI_LOAD_AB_MUL]: begin
A_sign <= rs1_sign ^ rs2_sign ^ (isFNMSUB | isFNMADD);
A_frac <= prod_Z ? 0 :
(prod_frac[47] ? prod_frac : {prod_frac[48:0],1'b0});
A_exp <= prod_Z ? 0 : prod_exp_norm;
B_sign <= rs3_sign ^ (isFMSUB | isFNMADD);
B_frac <= {2'b0, rs3_frac, 24'd0};
B_exp <= {1'b0, rs3_exp};
end
// A <- normalize(A)
// Results with a null fraction or a non-positive normalized exponent
// are flushed to zero.
fpmi_is[FPMI_NORM]: begin
if(A_exp_norm <= 0 || (A_frac == 0)) begin
A_frac <= 0;
A_exp <= 0;
end else begin
// left shamt = 47 - first_bit_set = A_clz - 16
// (reminder: first_bit_set = 63 - A_clz)
`ASSERT(
63 - A_clz <= 48, ("NORM: first bit set = %d\n",63-A_clz)
);
// First bit set on bit 48: shift right by one, else shift left.
A_frac <= A_frac[48] ? (A_frac >> 1) : A_frac << (A_clz - 16);
A_exp <= A_exp_norm;
end
end
// if(|A| > |B|) swap(A,B)
fpmi_is[FPMI_ADD_SWAP]: begin
if(fabsB_LT_fabsA) begin
A_frac <= B_frac; B_frac <= A_frac;
A_exp <= B_exp; B_exp <= A_exp;
A_sign <= B_sign; B_sign <= A_sign;
end
end
// shift A in order to make it match B exponent
// (A becomes zero when the exponents differ by more than 47)
fpmi_is[FPMI_ADD_SHIFT]: begin
`ASSERT(!fabsB_LT_fabsA, ("ADD_SHIFT: incorrect order"));
A_frac <= (exp_diff > 47) ? 0 : (A_frac >> exp_diff[5:0]);
A_exp <= B_exp;
end
// A <- A (+/-) B
// Same signs: add the fractions. Different signs: B - A.
// The result takes the sign of B.
fpmi_is[FPMI_ADD_ADD]: begin
A_frac <= (A_sign ^ B_sign) ? frac_diff[49:0] : frac_sum[49:0];
A_sign <= B_sign;
end
// A <- result of comparison between A and B
// (stored as the integer 0 or 1 through FPU_OUT)
fpmi_is[FPMI_CMP]: begin
`FPU_OUT <= { 31'b0,
isFLT && A_LT_B ||
isFLE && A_LE_B ||
isFEQ && A_EQ_B
};
end
// Register moves used by the FDIV and FSQRT micro-programs
fpmi_is[FPMI_MV_RS2_TMP1] : rs2 <= tmp1;
fpmi_is[FPMI_MV_RS2_TMP2] : rs2 <= tmp2;
fpmi_is[FPMI_MV_RS1_A] : rs1 <= {A_sign,A_exp[7:0],A_frac[46:24]};
fpmi_is[FPMI_MV_TMP2_A] : tmp2 <= {A_sign,A_exp[7:0],A_frac[46:24]};
// rs2 <= -|tmp1| / 2.0
// (sign forced to 1, exponent decremented by one)
fpmi_is[FPMI_MV_RS2_MHTMP1]:rs2<={1'b1,tmp1[30:23]-8'd1,tmp1[22:0]};
// FDIV prolog: save the operands (tmp1 <- dividend, tmp2 <- divisor)
// and set up the operands of the FMADD that computes the initial
// estimate.
fpmi_is[FPMI_FRCP_PROLOG]: begin
tmp1 <= rs1;
tmp2 <= rs2;
// rs1 <= -D', that is, -(fprs2 normalized in [0.5,1])
rs1 <= {1'b1, 8'd126, rs2_frac[22:0]};
rs2 <= 32'h3FF0F0F1; // 32/17
rs3 <= 32'h4034B4B5; // 48/17
end
// FDIV iteration: set up the operands of the FMADD computing -D'*A + 2
fpmi_is[FPMI_FRCP_ITER]: begin
rs1 <= {1'b1, 8'd126, tmp2[22:0]}; // -D'
rs2 <= {A_sign, A_exp[7:0], A_frac[46:24]}; // A
rs3 <= 32'h40000000; // 2.0
end
// FDIV epilog: rs1 <- reciprocal of the divisor (sign of the divisor,
// exponent frcp_exp, fraction of A); rs2 <- dividend. The FMUL that
// follows computes the quotient.
fpmi_is[FPMI_FRCP_EPILOG]: begin
rs1 <= {tmp2[31], frcp_exp[7:0], A_frac[46:24]};
rs2 <= tmp1;
end
// FSQRT prolog: save the operand in tmp1 and load the initial
// estimate in tmp2, rs1 and rs2.
fpmi_is[FPMI_FRSQRT_PROLOG]: begin
tmp1 <= rs1;
tmp2 <= rsqrt_doom_magic;
rs1 <= rsqrt_doom_magic;
rs2 <= rsqrt_doom_magic;
rs3 <= 32'h3fc00000; // 1.5
end
// Float to integer: the shifted magnitude, negated for a negative
// operand of the signed conversion.
fpmi_is[FPMI_FP_TO_INT]: begin
// TODO: check overflow
`FPU_OUT <=
(isFCVTWUS | !A_sign) ? A_fcvt_ftoi_shifted
: -$signed(A_fcvt_ftoi_shifted);
end
// Integer to float: load the magnitude of rs1 in the 32 upper bits
// of A_frac with a fixed exponent; the NORM micro-instruction that
// follows normalizes the result.
fpmi_is[FPMI_INT_TO_FP]: begin
// TODO: rounding
A_frac <= (isFCVTSWU | !rs1[31]) ? {rs1, 18'd0}
: {-$signed(rs1), 18'd0};
A_sign <= isFCVTSW & rs1[31];
// 127+23: standard exponent bias
// +6 because it is bit 29 of rs1 that overwrites
// bit 47 of A_frac, instead of bit 23 (and 29-23 = 6).
A_exp <= 127+23+6;
end
// FMIN selects A when A < B, FMAX selects A when !(A < B).
fpmi_is[FPMI_MIN_MAX]: begin
`FPU_OUT <= (A_LT_B ^ isFMAX)
? {A_sign, A_exp[7:0], A_frac[46:24]}
: {B_sign, B_exp[7:0], B_frac[46:24]};
end
endcase
// register write-back
// Skipped for branches and stores, and for integer destination x0
// (rdIsNZ is not required for floating-point destinations).
end else if(
!(isBranch | isStore) & (rdIsFP | rdIsNZ) &
(state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit])
) begin
registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
end
end
`ifdef VERILATOR
// When doing simulations, compare the result of all operations with
// what's computed on the host CPU.
// The comparison itself is done by the CHECK_xxx host functions called
// through $c32; their result is stored in z, which is not used by any
// logic visible in this part of the file.
reg [31:0] z;
reg [31:0] rs1_bkp;
reg [31:0] rs2_bkp;
reg [31:0] rs3_bkp;
always @(posedge clk) begin
// Some micro-coded instructions (FDIV/FSQRT) use rs1, rs2 and
// rs3 as temporary registers, so we need to save them to be able
// to recompute the operation on the host CPU.
if(isFPU && state[EXECUTE2_bit]) begin
rs1_bkp <= rs1;
rs2_bkp <= rs2;
rs3_bkp <= rs3;
end
// Check once the micro-program has returned to address 0.
if(
isFPU && state[WAIT_ALU_OR_MEM_bit] && fpmi_PC == 0
) begin
case(1'b1)
isFMUL: z <= $c32("CHECK_FMUL(",fpuOut,",",rs1,",",rs2,")");
isFADD: z <= $c32("CHECK_FADD(",fpuOut,",",rs1,",",rs2,")");
isFSUB: z <= $c32("CHECK_FSUB(",fpuOut,",",rs1,",",rs2,")");
// my FDIV and FSQRT are not IEEE754 compliant !
// (checks commented-out for now)
// Note: checks use rs1_bkp and rs2_bkp because
// FDIV and FSQRT overwrite rs1 and rs2
//
//isFDIV:
// z<=$c32("CHECK_FDIV(",fpuOut,",",rs1_bkp,",",rs2_bkp,")");
//isFSQRT:
// z<=$c32("CHECK_FSQRT(",fpuOut,",",rs1_bkp,")");
isFMADD :
z<=$c32("CHECK_FMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFMSUB :
z<=$c32("CHECK_FMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFNMSUB:
z<=$c32("CHECK_FNMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFNMADD:
z<=$c32("CHECK_FNMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFEQ: z <= $c32("CHECK_FEQ(",fpuOut,",",rs1,",",rs2,")");
isFLT: z <= $c32("CHECK_FLT(",fpuOut,",",rs1,",",rs2,")");
isFLE: z <= $c32("CHECK_FLE(",fpuOut,",",rs1,",",rs2,")");
isFCVTWS : z <= $c32("CHECK_FCVTWS(",fpuOut,",",rs1,")");
isFCVTWUS: z <= $c32("CHECK_FCVTWUS(",fpuOut,",",rs1,")");
isFCVTSW : z <= $c32("CHECK_FCVTSW(",fpuOut,",",rs1,")");
isFCVTSWU: z <= $c32("CHECK_FCVTSWU(",fpuOut,",",rs1,")");
isFMIN: z <= $c32("CHECK_FMIN(",fpuOut,",",rs1,",",rs2,")");
isFMAX: z <= $c32("CHECK_FMAX(",fpuOut,",",rs1,",",rs2,")");
endcase
end
end
`endif
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except DIV
/***************************************************************************/
// ALU operands.
//   aluIn1: always rs1
//   aluIn2: rs2 for ALUreg and Branch, Iimm otherwise (ALUimm, Load, JALR)
wire [31:0] aluIn1;
wire [31:0] aluIn2;
assign aluIn1 = rs1;
assign aluIn2 = (isALUreg | isBranch) ? rs2 : Iimm;

wire aluWr; // ALU write strobe

// The adder is shared by the arithmetic instructions and JALR.
wire [31:0] aluPlus;
assign aluPlus = aluIn1 + aluIn2;

// A single 33-bit subtraction (aluIn1 - aluIn2, written as aluIn1 plus the
// two's complement of aluIn2) gives the difference and all the comparisons
// (trick borrowed from swapforth/J1).
wire [32:0] aluMinus;
assign aluMinus = {1'b0, aluIn1} + {1'b1, ~aluIn2} + 33'b1;

wire LT, LTU, EQ;
// Unsigned "less than": bit 32 of the 33-bit difference.
assign LTU = aluMinus[32];
// Signed "less than": with different signs, the negative operand is the
// smaller one; with identical signs, same as the unsigned comparison.
assign LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : LTU;
// Equality: the 32-bit difference is null.
assign EQ  = (aluMinus[31:0] == 32'b0);
/***************************************************************************/
// One arithmetic right shifter does all the shifts. For a left shift
// (funct3 == 1) the operand is bit-reversed before the shift and the
// result is bit-reversed again after it.
wire [31:0] shifter_in;
assign shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;

// The operand is extended to 33 bits with the bit to be shifted in:
// the sign of aluIn1 when instr[30] is set (SRA), zero otherwise.
/* verilator lint_off WIDTH */
wire [31:0] shifter;
assign shifter =
   $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */

wire [31:0] leftshift;
assign leftshift = flip32(shifter);
/***************************************************************************/
// 33 x 33 bits signed multiplier, shared by MUL, MULH, MULHSU and MULHU.
// Each operand is extended by one bit: its sign bit when the operand is to
// be taken as signed, zero otherwise.
// funct3: 1->MULH (signed x signed), 2->MULHSU (signed x unsigned),
//         3->MULHU (unsigned x unsigned)
wire isMULH, isMULHSU;
assign isMULH   = funct3Is[1];
assign isMULHSU = funct3Is[2];

wire sign1, sign2;
assign sign1 = aluIn1[31] & isMULH;
assign sign2 = aluIn2[31] & (isMULH | isMULHSU);

wire signed [32:0] signed1, signed2;
assign signed1 = {sign1, aluIn1};
assign signed2 = {sign2, aluIn2};

wire signed [63:0] multiply;
assign multiply = signed1 * signed2;
/***************************************************************************/
// Result of the base (RV32I) ALU operations, selected by the 1-hot funct3.
// Exactly one term of the OR is non-zero.
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
wire [31:0] alu_base =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
(funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
// funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
// 4->DIV 5->DIVU 6->REM 7->REMU
wire [31:0] alu_mul = funct3Is[0]
? multiply[31: 0] // 0:MUL
: multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
// Division result: remainder (kept in dividend) when instr[13] is set,
// quotient otherwise; negated when div_sign is set.
wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend)
: (div_sign ? -quotient : quotient);
wire aluBusy = |quotient_msk; // ALU is busy if division in progress.
// Registered ALU output (updated at every clock).
reg [31:0] aluOut;
wire funcM = instr[25]; // set for the RV32M instructions (when isALUreg)
wire isDivide = instr[14]; // among RV32M: set for DIV/DIVU/REM/REMU
always @(posedge clk) begin
aluOut <= (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
end
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// Sequential restoring division on magnitudes, one quotient bit per clock:
//  - when aluWr is asserted, the operands are loaded (negated if negative,
//    for the signed variants, i.e. instr[12] == 0) and the sign of the
//    result is recorded in div_sign
//  - on every other clock, the divisor is shifted right by one and
//    subtracted from the dividend whenever it fits
// At the end, quotient holds the quotient and dividend holds the remainder.
// quotient_msk only serves as a busy timer: bit 32 is set when a divide
// instruction is started, then shifted right at every clock; aluBusy is
// asserted as long as it is non-zero.
// NOTE(review): the other bits of quotient_msk are not written when aluWr
// is asserted, and the division steps keep running when no division is in
// progress; this presumably relies on aluWr not being asserted while
// aluBusy is set -- confirm against the state machine.
reg div_sign;
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [32:0] quotient_msk;
always @(posedge clk) begin
if (aluWr) begin
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
quotient_msk[32] <= isALUreg & funcM & isDivide;
// Signed variants only. Remainder (instr[13]): sign of the dividend.
// Quotient: negative if the signs differ and the divisor is not null.
div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] ^ aluIn2[31]) & |aluIn2);
end else begin
divisor <= divisor >> 1;
quotient_msk <= quotient_msk >> 1;
if(divisor <= {31'b0, dividend}) begin
quotient <= {quotient[30:0],1'b1};
dividend <= dividend - divisor[31:0];
end else begin
quotient <= {quotient[30:0],1'b0};
end
end
end
/***************************************************************************/
// The predicate for conditional branches.
/***************************************************************************/
// Branch condition, computed combinatorially from the ALU comparisons and
// selected by the 1-hot funct3 (funct3 values 2 and 3 never match).
wire predicate_ = |{
   funct3Is[0] &  EQ,   // BEQ
   funct3Is[1] & !EQ,   // BNE
   funct3Is[4] &  LT,   // BLT
   funct3Is[5] & !LT,   // BGE
   funct3Is[6] &  LTU,  // BLTU
   funct3Is[7] & !LTU   // BGEU
};
// Registered copy of the branch condition (not written in this part of
// the file).
reg predicate;
/***************************************************************************/
// Program counter and branch target computation.
// PCplusImm and loadstore_addr are registers: both are loaded by the
// state machine in the EXECUTE1 state and consumed afterwards.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
// Address of the next sequential instruction (also the JAL/JALR link value).
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
reg [ADDR_WIDTH-1:0] PCplusImm;
// A separate adder to compute the destination of load/store.
reg [ADDR_WIDTH-1:0] loadstore_addr;
// Address bus: PC while an instruction is being fetched (FETCH_INSTR and
// WAIT_INSTR states), the load/store address in every other state.
// ADDR_PAD is defined elsewhere; presumably it extends the ADDR_WIDTH-bit
// address to the width of mem_addr (to be confirmed).
assign mem_addr = {ADDR_PAD,
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
PC : loadstore_addr
};
/***************************************************************************/
// The value written back to the register file.
// Built as an OR of per-instruction-class terms, each forced to zero
// unless its decode signal is set. This gives the intended value only if
// at most one of the is* signals is active at a time (they are decoded
// elsewhere in the module; assumed mutually exclusive).
/***************************************************************************/
wire [31:0] writeBackData =
// WIDTH lint is silenced for the cycles term only: the counter width is
// configurable (NRV_COUNTER_WIDTH) and may differ from 32 bits.
/* verilator lint_off WIDTH */
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
/* verilator lint_on WIDTH */
(isLUI ? Uimm : 32'b0) | // LUI
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
(isFPU ? fpuOut : 32'b0) | // FPU
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
(isJALR | isJAL ? {ADDR_PAD,PCplus4 } : 32'b0) | // JAL, JALR
(isLoad ? LOAD_data : 32'b0); // Load
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// Memory is always accessed as aligned 32-bit words. Byte and halfword
// accesses are rebuilt from the word using:
//  - funct3[1:0] (=instr[13:12]): 00->byte 01->halfword 10->word
//  - the two low bits of the load/store address: which lane is accessed
//  - instr[2], which is set for FLW and FSW (never byte or halfword)
wire mem_byteAccess     = (instr[13:12] == 2'b00) & ~instr[2];
wire mem_halfwordAccess = (instr[13:12] == 2'b01) & ~instr[2];

// LOAD: lane extraction. Address bit 1 picks the halfword inside the
// word, address bit 0 then picks the byte inside that halfword.
wire [15:0] LOAD_halfword =
   loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
wire [7:0] LOAD_byte =
   loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

// LOAD: extension bit. funct3[2] (instr[14]) set means no sign
// expansion, so the extension bit is forced to 0 in that case.
wire LOAD_sign =
   ~instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

// LOAD: final value. Sub-word loads share the upper 16 extension bits;
// a byte load additionally extends bits 15:8.
wire [31:0] LOAD_data =
   (mem_byteAccess | mem_halfwordAccess)
      ? { {16{LOAD_sign}},
          (mem_byteAccess ? { {8{LOAD_sign}}, LOAD_byte } : LOAD_halfword) }
      : mem_rdata;
// STORE: write data lanes.
// The low byte of rs2 is copied to every lane a byte store may target,
// and the low halfword to the upper half for halfword stores; the write
// mask below selects the lanes that are actually written.
assign mem_wdata[15: 0] = {
   loadstore_addr[0] ? rs2[7:0] : rs2[15:8],
   rs2[7:0]
};
assign mem_wdata[31:16] = {
   loadstore_addr[0] ? rs2[7:0] :
   loadstore_addr[1] ? rs2[15:8] : rs2[31:24],
   loadstore_addr[1] ? rs2[7:0] : rs2[23:16]
};

// STORE: write mask, one bit per byte lane.
//   byte     : 0001, 0010, 0100 or 1000 (lane = loadstore_addr[1:0])
//   halfword : 0011 or 1100             (half = loadstore_addr[1])
//   word     : 1111
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0]) :
   mem_halfwordAccess ? { {2{loadstore_addr[1]}}, {2{~loadstore_addr[1]}} } :
                        4'b1111;
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
// Bit index of each state in the state register (one bit per state).
localparam FETCH_INSTR_bit = 0; // memory read strobe issued, PC on mem_addr
localparam WAIT_INSTR_bit = 1; // wait for !mem_rbusy, then latch instr
localparam EXECUTE1_bit = 2; // register PCplusImm, loadstore_addr, predicate
localparam EXECUTE2_bit = 3; // update PC, issue load/store strobes
localparam WAIT_ALU_OR_MEM_bit = 4; // wait for ALU, FPU and memory; reset state
localparam NB_STATES = 5;
// Value of the state register for each state: only the state's bit set.
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE1 = 1 << EXECUTE1_bit;
localparam EXECUTE2 = 1 << EXECUTE2_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
// Synthesis attribute; presumably tells the toolchain that state is
// one-hot encoded (tool-specific, to be confirmed).
(* onehot *)
reg [NB_STATES-1:0] state;
// Control signals derived combinatorially from the current state and
// the decoded instruction.

// Memory read strobe: instruction fetch, or a load in EXECUTE2.
assign mem_rstrb = state[FETCH_INSTR_bit] | (isLoad & state[EXECUTE2_bit]);

// Memory write mask: the store mask, active only for a store in EXECUTE2.
assign mem_wmask = STORE_wmask & {4{isStore & state[EXECUTE2_bit]}};

// aluWr is raised in EXECUTE1 for ALU instructions; it makes the divider
// load its operands.
assign aluWr = isALU & state[EXECUTE1_bit];

// The next PC is PCplusImm for JAL and for taken conditional branches.
wire jumpToPCplusImm = (predicate & isBranch) | isJAL;

// Whether EXECUTE2 is followed by WAIT_ALU_OR_MEM rather than FETCH_INSTR.
// When NRV_IS_IO_ADDR is defined, a store only waits if that macro
// matches mem_addr.
`ifdef NRV_IS_IO_ADDR
wire needToWait = isFPU | aluBusy | isLoad |
                  (isStore & `NRV_IS_IO_ADDR(mem_addr));
`else
wire needToWait = isFPU | aluBusy | isLoad | isStore;
`endif
// Main sequencer. State order for one instruction:
// FETCH_INSTR -> WAIT_INSTR -> EXECUTE1 -> EXECUTE2
// -> (WAIT_ALU_OR_MEM when needToWait) -> FETCH_INSTR
// reset is active low: while it is 0, PC is set to RESET_ADDR and the
// machine is parked in WAIT_ALU_OR_MEM.
always @(posedge clk) begin
if(!reset) begin
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
end else
// See note [1] at the end of this file.
(* parallel_case *)
case(1'b1)
state[WAIT_INSTR_bit]: begin
if(!mem_rbusy) begin // may be high when executing from SPI flash
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE1; // also the declaration of instr).
end
end
state[EXECUTE1_bit]: begin
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to:
// PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// testing instr[5] is equivalent to testing isStore in this context.
loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
// Register the branch condition; it is used in EXECUTE2 through
// jumpToPCplusImm.
predicate <= predicate_;
state <= EXECUTE2;
end
state[EXECUTE2_bit]: begin
// Next PC: JALR takes aluPlus with bit 0 cleared, JAL and taken
// branches take PCplusImm, everything else continues at PC+4.
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
state[WAIT_ALU_OR_MEM_bit]: begin
// Leave only once the ALU, the FPU and both memory directions are idle.
if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
state <= FETCH_INSTR;
end
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
/***************************************************************************/
// Cycle counter
// Free-running: incremented on every clock, with no reset term here.
// It is 32 bits wide unless NRV_COUNTER_WIDTH overrides the width, and is
// the value written back for SYSTEM instructions.
/***************************************************************************/
`ifndef NRV_COUNTER_WIDTH
reg [31:0] cycles;
`else
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`endif
always @(posedge clk) begin
   cycles <= cycles + 1'b1;
end
endmodule
/*****************************************************************************/