newStep.v
This commit is contained in:
689
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IM_simF.v
Normal file
689
RTL/PROCESSOR/TESTDRIVE/femtorv32_testdrive_RV32IM_simF.v
Normal file
@@ -0,0 +1,689 @@
|
||||
/******************************************************************************/
|
||||
// Electron: valid. fmax: 70 MHz exp. fmax: 80 MHz
|
||||
// TestDrive: morphing tachyon into a RV32IMF core, trying to
|
||||
// preserve maxfreq at each step.
|
||||
// Step 0: Tachyon valid. fmax: 115-120 MHz exp. fmax: 135-140 MHz
|
||||
// Step 1: Barrel shft valid. fmax: 110-115 MHz exp. fmax: 130-135 MHz
|
||||
// Step 2: RV32M valid. fmax: 105-115 MHz exp. fmax: 120 MHz
|
||||
// Step 3: RV32F decod only valid. fmax: 100-105 MHz exp. fmax: 105 MHz
|
||||
|
||||
//
|
||||
/******************************************************************************/
|
||||
|
||||
// Firmware generation flags for this processor
|
||||
`define NRV_ARCH "rv32imaf"
|
||||
`define NRV_ABI "ilp32f"
|
||||
|
||||
//`define NRV_ARCH "rv32im"
|
||||
//`define NRV_ABI "ilp32"
|
||||
|
||||
`define NRV_OPTIMIZE "-O3"
|
||||
|
||||
// Check condition and display message in simulation
|
||||
`ifdef BENCH
|
||||
`define ASSERT(cond,msg) if(!(cond)) $display msg
|
||||
`define ASSERT_NOT_REACHED(msg) $display msg
|
||||
`else
|
||||
`define ASSERT(cond,msg)
|
||||
`define ASSERT_NOT_REACHED(msg)
|
||||
`endif
|
||||
|
||||
// FPU Normalization needs to detect the position of the first bit set
|
||||
// in the A_frac register. It is easier to count the number of leading
|
||||
// zeroes (CLZ for Count Leading Zeroes), as follows. See:
|
||||
// https://electronics.stackexchange.com/questions/196914/verilog-synthesize-high-speed-leading-zero-count
|
||||
module CLZ #(
|
||||
parameter W_IN = 64, // must be power of 2, >= 2
|
||||
parameter W_OUT = $clog2(W_IN)
|
||||
) (
|
||||
input wire [W_IN-1:0] in,
|
||||
output wire [W_OUT-1:0] out
|
||||
);
|
||||
generate
|
||||
if(W_IN == 2) begin
|
||||
assign out = !in[1];
|
||||
end else begin
|
||||
wire [W_OUT-2:0] half_count;
|
||||
wire [W_IN/2-1:0] lhs = in[W_IN/2 +: W_IN/2];
|
||||
wire [W_IN/2-1:0] rhs = in[0 +: W_IN/2];
|
||||
wire left_empty = ~|lhs;
|
||||
CLZ #(
|
||||
.W_IN(W_IN/2)
|
||||
) inner(
|
||||
.in(left_empty ? rhs : lhs),
|
||||
.out(half_count)
|
||||
);
|
||||
assign out = {left_empty, half_count};
|
||||
end
|
||||
endgenerate
|
||||
endmodule
|
||||
|
||||
module FemtoRV32(
|
||||
input clk,
|
||||
|
||||
output [31:0] mem_addr, // address bus
|
||||
output [31:0] mem_wdata, // data to be written
|
||||
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
|
||||
input [31:0] mem_rdata, // input lines for both data and instr
|
||||
output mem_rstrb, // active to initiate memory read (used by IO)
|
||||
input mem_rbusy, // asserted if memory is busy reading value
|
||||
input mem_wbusy, // asserted if memory is busy writing value
|
||||
|
||||
input reset // set to 0 to reset the processor
|
||||
);
|
||||
|
||||
parameter RESET_ADDR = 32'h00000000;
|
||||
parameter ADDR_WIDTH = 24;
|
||||
|
||||
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
|
||||
|
||||
|
||||
// Flip a 32 bit word. Used by the shifter (a single shifter for
|
||||
// left and right shifts, saves silicium !)
|
||||
function [31:0] flip32;
|
||||
input [31:0] x;
|
||||
flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7],
|
||||
x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15],
|
||||
x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
|
||||
x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
|
||||
endfunction
|
||||
|
||||
/***************************************************************************/
|
||||
// Instruction decoding.
|
||||
/***************************************************************************/
|
||||
|
||||
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
|
||||
// Reference: Table page 104 of:
|
||||
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
|
||||
|
||||
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
|
||||
// It is used as follows: funct3Is[val] <=> funct3 == val
|
||||
(* onehot *) reg [7:0] funct3Is;
|
||||
|
||||
// Instruction decoder and immediate decoder
|
||||
// Base RISC-V (RV32I) has only 10 different instructions !
|
||||
|
||||
reg isLoad, isALUimm, isAUIPC, isStore, isALUreg, isLUI,
|
||||
isBranch, isJALR, isJAL, isSYSTEM, isFPU;
|
||||
|
||||
reg [31:0] Uimm, Iimm, Simm, Bimm, Jimm;
|
||||
reg rdIsNZ; // Asserted if dest. register is non-zero (writeback)
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(state[WAIT_INSTR_bit]) begin
|
||||
isLoad <= (mem_rdata[6:3] == 4'b0000); // rd <- mem[rs1+Iimm]
|
||||
isALUimm <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
|
||||
isAUIPC <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
|
||||
isStore <= (mem_rdata[6:3] == 4'b0100); // mem[rs1+Simm] <- rs2
|
||||
isALUreg <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
|
||||
isLUI <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
|
||||
isBranch <= (mem_rdata[6:2] == 5'b11000); // if(rs1OPrs2) PC<-PC+Bimm
|
||||
isJALR <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
|
||||
isJAL <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
|
||||
isSYSTEM <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles
|
||||
isFPU <= (mem_rdata[6:5] == 2'b10); // all FPU except FLW/FSW
|
||||
funct3Is <= 8'b00000001 << mem_rdata[14:12];
|
||||
|
||||
Uimm <= { mem_rdata[31], mem_rdata[30:12], {12{1'b0}}};
|
||||
Iimm <= {{21{mem_rdata[31]}}, mem_rdata[30:20]};
|
||||
Simm <= {{21{mem_rdata[31]}}, mem_rdata[30:25],mem_rdata[11:7]};
|
||||
Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0};
|
||||
Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0};
|
||||
|
||||
rdIsNZ <= |mem_rdata[11:7];
|
||||
end
|
||||
end
|
||||
|
||||
wire isALU = isALUimm | isALUreg;
|
||||
|
||||
/***************************************************************************/
|
||||
// The register file.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [31:0] rs1;
|
||||
reg [31:0] rs2;
|
||||
reg [31:0] rs3; // this one is used by the FMA instructions.
|
||||
|
||||
reg [31:0] registerFile [0:63]; // 0..31: integer registers
|
||||
// 32..63: floating-point registers
|
||||
|
||||
/***************************************************************************/
|
||||
// The FPU
|
||||
/***************************************************************************/
|
||||
|
||||
// instruction decoder
|
||||
|
||||
reg isFMADD, isFMSUB, isFNMSUB, isFNMADD, isFADD, isFSUB, isFMUL, isFDIV,
|
||||
isFSQRT, isFSGNJ, isFSGNJN, isFSGNJX, isFMIN, isFMAX, isFEQ, isFLT,
|
||||
isFLE, isFCLASS, isFCVTWS, isFCVTWUS, isFCVTSW, isFCVTSWU, isFMVXW,
|
||||
isFMVWX;
|
||||
|
||||
reg rdIsFP; // Asserted if destination register is a FP register.
|
||||
|
||||
// rs1 is a FP register if instr[6:5] = 2'b10 except for:
|
||||
// FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[30:28] = 3'b101
|
||||
// FMV.W.X : instr[6:2] = 5'b10100 and instr[30:28] = 3'b111
|
||||
// (two versions of the signal, one for regular instruction decode,
|
||||
// the other one for compressed instructions).
|
||||
wire rs1IsFP = (mem_rdata[6:5] == 2'b10 ) &&
|
||||
!((mem_rdata[4:2] == 3'b100) && (
|
||||
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
|
||||
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
|
||||
)
|
||||
);
|
||||
|
||||
// rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
|
||||
// (two versions of the signal, one for regular instruction decode,
|
||||
// the other one for compressed instructions).
|
||||
wire rs2IsFP = (mem_rdata[6:5] == 2'b10) || (mem_rdata[6:2]==5'b01001);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(state[WAIT_INSTR_bit]) begin
|
||||
isFMADD <= (mem_rdata[4:2] == 3'b000);
|
||||
isFMSUB <= (mem_rdata[4:2] == 3'b001);
|
||||
isFNMSUB <= (mem_rdata[4:2] == 3'b010);
|
||||
isFNMADD <= (mem_rdata[4:2] == 3'b011);
|
||||
|
||||
isFADD <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00000));
|
||||
isFSUB <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00001));
|
||||
isFMUL <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00010));
|
||||
isFDIV <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00011));
|
||||
isFSQRT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b01011));
|
||||
|
||||
isFSGNJ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b00));
|
||||
isFSGNJN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b01));
|
||||
isFSGNJX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b10));
|
||||
|
||||
isFMIN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && !mem_rdata[12]);
|
||||
isFMAX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && mem_rdata[12]);
|
||||
|
||||
isFEQ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b10));
|
||||
isFLT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b01));
|
||||
isFLE <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b00));
|
||||
|
||||
isFCLASS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && mem_rdata[12]);
|
||||
|
||||
isFCVTWS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && !mem_rdata[20]);
|
||||
isFCVTWUS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && mem_rdata[20]);
|
||||
|
||||
isFCVTSW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && !mem_rdata[20]);
|
||||
isFCVTSWU <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && mem_rdata[20]);
|
||||
|
||||
isFMVXW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && !mem_rdata[12]);
|
||||
isFMVWX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11110));
|
||||
|
||||
rdIsFP <= (mem_rdata[6:2] == 5'b00001) || // FLW
|
||||
(mem_rdata[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB
|
||||
(mem_rdata[6:4] == 3'b101 && (
|
||||
(mem_rdata[31] == 1'b0) || // R-Type FPU
|
||||
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
|
||||
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
|
||||
)
|
||||
);
|
||||
end
|
||||
end
|
||||
|
||||
reg [31:0] fpuOut;
|
||||
`define FPU_OUT fpuOut
|
||||
wire fpuBusy = 0;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(state[WAIT_INSTR_bit]) begin
|
||||
// Fetch registers as soon as instruction is ready.
|
||||
rs1 <= registerFile[{rs1IsFP,mem_rdata[19:15]}];
|
||||
rs2 <= registerFile[{rs2IsFP,mem_rdata[24:20]}];
|
||||
rs3 <= registerFile[{1'b1, mem_rdata[31:27]}];
|
||||
end else if(state[EXECUTE2_bit] & isFPU) begin
|
||||
`ifdef VERILATOR
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
isFMADD : `FPU_OUT <= $c32("FMADD(",rs1,",",rs2,",",rs3,")");
|
||||
isFMSUB : `FPU_OUT <= $c32("FMSUB(",rs1,",",rs2,",",rs3,")");
|
||||
isFNMSUB : `FPU_OUT <= $c32("FNMSUB(",rs1,",",rs2,",",rs3,")");
|
||||
isFNMADD : `FPU_OUT <= $c32("FNMADD(",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFMUL : `FPU_OUT <= $c32("FMUL(",rs1,",",rs2,")");
|
||||
isFADD : `FPU_OUT <= $c32("FADD(",rs1,",",rs2,")");
|
||||
isFSUB : `FPU_OUT <= $c32("FSUB(",rs1,",",rs2,")");
|
||||
|
||||
isFDIV : `FPU_OUT <= $c32("FDIV(",rs1,",",rs2,")");
|
||||
isFSQRT : `FPU_OUT <= $c32("FSQRT(",rs1,")");
|
||||
|
||||
|
||||
isFSGNJ : `FPU_OUT <= $c32("FSGNJ(",rs1,",",rs2,")");
|
||||
isFSGNJN : `FPU_OUT <= $c32("FSGNJN(",rs1,",",rs2,")");
|
||||
isFSGNJX : `FPU_OUT <= $c32("FSGNJX(",rs1,",",rs2,")");
|
||||
|
||||
isFMIN : `FPU_OUT <= $c32("FMIN(",rs1,",",rs2,")");
|
||||
isFMAX : `FPU_OUT <= $c32("FMAX(",rs1,",",rs2,")");
|
||||
|
||||
isFEQ : `FPU_OUT <= $c32("FEQ(",rs1,",",rs2,")");
|
||||
isFLE : `FPU_OUT <= $c32("FLE(",rs1,",",rs2,")");
|
||||
isFLT : `FPU_OUT <= $c32("FLT(",rs1,",",rs2,")");
|
||||
|
||||
isFCLASS : `FPU_OUT <= $c32("FCLASS(",rs1,")") ;
|
||||
|
||||
isFCVTWS : `FPU_OUT <= $c32("FCVTWS(",rs1,")");
|
||||
isFCVTWUS: `FPU_OUT <= $c32("FCVTWUS(",rs1,")");
|
||||
|
||||
isFCVTSW : `FPU_OUT <= $c32("FCVTSW(",rs1,")");
|
||||
isFCVTSWU: `FPU_OUT <= $c32("FCVTSWU(",rs1,")");
|
||||
|
||||
isFMVXW: `FPU_OUT <= rs1;
|
||||
isFMVWX: `FPU_OUT <= rs1;
|
||||
endcase
|
||||
`endif
|
||||
|
||||
// register write-back
|
||||
end else if(
|
||||
!(isBranch | isStore) & (rdIsFP | rdIsNZ) &
|
||||
(state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit])
|
||||
) begin
|
||||
registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
`ifdef VERILATOR
|
||||
// When doing simulations, compare the result of all operations with
|
||||
// what's computed on the host CPU.
|
||||
|
||||
reg [31:0] z;
|
||||
reg [31:0] rs1_bkp;
|
||||
reg [31:0] rs2_bkp;
|
||||
reg [31:0] rs3_bkp;
|
||||
|
||||
always @(posedge clk) begin
|
||||
// Some micro-coded instructions (FDIV/FSQRT) use rs1, rs2 and
|
||||
// rs3 as temporaty registers, so we need to save them to be able
|
||||
// to recompute the operation on the host CPU.
|
||||
if(isFPU && state[EXECUTE2_bit]) begin
|
||||
rs1_bkp <= rs1;
|
||||
rs2_bkp <= rs2;
|
||||
rs3_bkp <= rs3;
|
||||
end
|
||||
|
||||
if(
|
||||
isFPU && state[WAIT_ALU_OR_MEM_bit] // && fpmi_PC == 0
|
||||
) begin
|
||||
case(1'b1)
|
||||
isFMUL: z <= $c32("CHECK_FMUL(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFADD: z <= $c32("CHECK_FADD(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFSUB: z <= $c32("CHECK_FSUB(",fpuOut,",",rs1,",",rs2,")");
|
||||
|
||||
// my FDIV and FSQRT are not IEEE754 compliant !
|
||||
// (checks commented-out for now)
|
||||
// Note: checks use rs1_bkp and rs2_bkp because
|
||||
// FDIV and FSQRT overwrite rs1 and rs2
|
||||
//
|
||||
//isFDIV:
|
||||
// z<=$c32("CHECK_FDIV(",fpuOut,",",rs1_bkp,",",rs2_bkp,")");
|
||||
//isFSQRT:
|
||||
// z<=$c32("CHECK_FSQRT(",fpuOut,",",rs1_bkp,")");
|
||||
|
||||
|
||||
isFMADD :
|
||||
z<=$c32("CHECK_FMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFMSUB :
|
||||
z<=$c32("CHECK_FMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFNMSUB:
|
||||
z<=$c32("CHECK_FNMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFNMADD:
|
||||
z<=$c32("CHECK_FNMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
|
||||
|
||||
isFEQ: z <= $c32("CHECK_FEQ(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFLT: z <= $c32("CHECK_FLT(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFLE: z <= $c32("CHECK_FLE(",fpuOut,",",rs1,",",rs2,")");
|
||||
|
||||
isFCVTWS : z <= $c32("CHECK_FCVTWS(",fpuOut,",",rs1,")");
|
||||
isFCVTWUS: z <= $c32("CHECK_FCVTWUS(",fpuOut,",",rs1,")");
|
||||
|
||||
isFCVTSW : z <= $c32("CHECK_FCVTSW(",fpuOut,",",rs1,")");
|
||||
isFCVTSWU: z <= $c32("CHECK_FCVTSWU(",fpuOut,",",rs1,")");
|
||||
|
||||
isFMIN: z <= $c32("CHECK_FMIN(",fpuOut,",",rs1,",",rs2,")");
|
||||
isFMAX: z <= $c32("CHECK_FMAX(",fpuOut,",",rs1,",",rs2,")");
|
||||
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
|
||||
/***************************************************************************/
|
||||
// The ALU. Does operations and tests combinatorially, except DIV
|
||||
/***************************************************************************/
|
||||
|
||||
// First ALU source, always rs1
|
||||
wire [31:0] aluIn1 = rs1;
|
||||
|
||||
// Second ALU source, depends on opcode:
|
||||
// ALUreg, Branch: rs2
|
||||
// ALUimm, Load, JALR: Iimm
|
||||
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
|
||||
|
||||
wire aluWr; // ALU write strobe
|
||||
|
||||
// The adder is used by both arithmetic instructions and JALR.
|
||||
wire [31:0] aluPlus = aluIn1 + aluIn2;
|
||||
|
||||
// Use a single 33 bits subtract to do subtraction and all comparisons
|
||||
// (trick borrowed from swapforth/J1)
|
||||
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
|
||||
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
|
||||
wire LTU = aluMinus[32];
|
||||
wire EQ = (aluMinus[31:0] == 0);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Use the same shifter both for left and right shifts by
|
||||
// applying bit reversal
|
||||
|
||||
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
|
||||
|
||||
/* verilator lint_off WIDTH */
|
||||
wire [31:0] shifter =
|
||||
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
|
||||
/* verilator lint_on WIDTH */
|
||||
|
||||
wire [31:0] leftshift = flip32(shifter);
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// funct3: 1->MULH, 2->MULHSU 3->MULHU
|
||||
wire isMULH = funct3Is[1];
|
||||
wire isMULHSU = funct3Is[2];
|
||||
|
||||
wire sign1 = aluIn1[31] & isMULH;
|
||||
wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
|
||||
|
||||
wire signed [32:0] signed1 = {sign1, aluIn1};
|
||||
wire signed [32:0] signed2 = {sign2, aluIn2};
|
||||
wire signed [63:0] multiply = signed1 * signed2;
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
// Notes:
|
||||
// - instr[30] is 1 for SUB and 0 for ADD
|
||||
// - for SUB, need to test also instr[5] to discriminate ADDI:
|
||||
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
|
||||
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
|
||||
|
||||
wire [31:0] alu_base =
|
||||
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
|
||||
(funct3Is[1] ? leftshift : 32'b0) |
|
||||
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
|
||||
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
|
||||
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
|
||||
(funct3Is[5] ? shifter : 32'b0) |
|
||||
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
|
||||
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
|
||||
|
||||
// funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
|
||||
// 4->DIV 5->DIVU 6->REM 7->REMU
|
||||
|
||||
wire [31:0] alu_mul = funct3Is[0]
|
||||
? multiply[31: 0] // 0:MUL
|
||||
: multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
|
||||
|
||||
wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend)
|
||||
: (div_sign ? -quotient : quotient);
|
||||
|
||||
|
||||
wire aluBusy = |quotient_msk; // ALU is busy if division in progress.
|
||||
reg [31:0] aluOut;
|
||||
|
||||
wire funcM = instr[25];
|
||||
wire isDivide = instr[14];
|
||||
|
||||
always @(posedge clk) begin
|
||||
aluOut <= (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
|
||||
|
||||
reg div_sign;
|
||||
|
||||
reg [31:0] dividend;
|
||||
reg [62:0] divisor;
|
||||
reg [31:0] quotient;
|
||||
reg [32:0] quotient_msk;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (aluWr) begin
|
||||
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
|
||||
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
|
||||
quotient <= 0;
|
||||
quotient_msk[32] <= isALUreg & funcM & isDivide;
|
||||
div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
|
||||
(aluIn1[31] ^ aluIn2[31]) & |aluIn2);
|
||||
end else begin
|
||||
divisor <= divisor >> 1;
|
||||
quotient_msk <= quotient_msk >> 1;
|
||||
if(divisor <= {31'b0, dividend}) begin
|
||||
quotient <= {quotient[30:0],1'b1};
|
||||
dividend <= dividend - divisor[31:0];
|
||||
end else begin
|
||||
quotient <= {quotient[30:0],1'b0};
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// The predicate for conditional branches.
|
||||
/***************************************************************************/
|
||||
|
||||
wire predicate_ =
|
||||
funct3Is[0] & EQ | // BEQ
|
||||
funct3Is[1] & !EQ | // BNE
|
||||
funct3Is[4] & LT | // BLT
|
||||
funct3Is[5] & !LT | // BGE
|
||||
funct3Is[6] & LTU | // BLTU
|
||||
funct3Is[7] & !LTU ; // BGEU
|
||||
|
||||
reg predicate;
|
||||
|
||||
/***************************************************************************/
|
||||
// Program counter and branch target computation.
|
||||
/***************************************************************************/
|
||||
|
||||
reg [ADDR_WIDTH-1:0] PC; // The program counter.
|
||||
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
|
||||
// ignored (not used in RV32I base instr set).
|
||||
|
||||
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
|
||||
|
||||
// An adder used to compute branch address, JAL address and AUIPC.
|
||||
reg [ADDR_WIDTH-1:0] PCplusImm;
|
||||
|
||||
// A separate adder to compute the destination of load/store.
|
||||
reg [ADDR_WIDTH-1:0] loadstore_addr;
|
||||
|
||||
assign mem_addr = {ADDR_PAD,
|
||||
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
|
||||
PC : loadstore_addr
|
||||
};
|
||||
|
||||
/***************************************************************************/
|
||||
// The value written back to the register file.
|
||||
/***************************************************************************/
|
||||
|
||||
wire [31:0] writeBackData =
|
||||
/* verilator lint_off WIDTH */
|
||||
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
|
||||
/* verilator lint_on WIDTH */
|
||||
(isLUI ? Uimm : 32'b0) | // LUI
|
||||
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
|
||||
(isFPU ? fpuOut : 32'b0) | // FPU
|
||||
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
|
||||
(isJALR | isJAL ? {ADDR_PAD,PCplus4 } : 32'b0) | // JAL, JALR
|
||||
(isLoad ? LOAD_data : 32'b0); // Load
|
||||
|
||||
/***************************************************************************/
|
||||
// LOAD/STORE
|
||||
/***************************************************************************/
|
||||
|
||||
// All memory accesses are aligned on 32 bits boundary. For this
|
||||
// reason, we need some circuitry that does unaligned halfword
|
||||
// and byte load/store, based on:
|
||||
// - funct3[1:0]: 00->byte 01->halfword 10->word (=instr[13:12])
|
||||
// - mem_addr[1:0]: indicates which byte/halfword is accessed
|
||||
// - instr[2] is set for FLW and FSW.
|
||||
wire mem_byteAccess = !instr[2] && (instr[13:12] == 2'b00);
|
||||
wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01);
|
||||
|
||||
// LOAD, in addition to funct3[1:0], LOAD depends on:
|
||||
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
|
||||
|
||||
wire LOAD_sign =
|
||||
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
|
||||
|
||||
wire [31:0] LOAD_data =
|
||||
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
|
||||
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
|
||||
mem_rdata ;
|
||||
|
||||
wire [15:0] LOAD_halfword =
|
||||
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
|
||||
|
||||
wire [7:0] LOAD_byte =
|
||||
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
|
||||
|
||||
// STORE
|
||||
|
||||
assign mem_wdata[ 7: 0] = rs2[7:0];
|
||||
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
|
||||
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
|
||||
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
|
||||
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
|
||||
|
||||
// The memory write mask:
|
||||
// 1111 if writing a word
|
||||
// 0011 or 1100 if writing a halfword
|
||||
// (depending on loadstore_addr[1])
|
||||
// 0001, 0010, 0100 or 1000 if writing a byte
|
||||
// (depending on loadstore_addr[1:0])
|
||||
|
||||
wire [3:0] STORE_wmask =
|
||||
mem_byteAccess ?
|
||||
(loadstore_addr[1] ?
|
||||
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
|
||||
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
|
||||
) :
|
||||
mem_halfwordAccess ?
|
||||
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
|
||||
4'b1111;
|
||||
|
||||
/*************************************************************************/
|
||||
// And, last but not least, the state machine.
|
||||
/*************************************************************************/
|
||||
|
||||
localparam FETCH_INSTR_bit = 0;
|
||||
localparam WAIT_INSTR_bit = 1;
|
||||
localparam EXECUTE1_bit = 2;
|
||||
localparam EXECUTE2_bit = 3;
|
||||
localparam WAIT_ALU_OR_MEM_bit = 4;
|
||||
localparam NB_STATES = 5;
|
||||
|
||||
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
|
||||
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
|
||||
localparam EXECUTE1 = 1 << EXECUTE1_bit;
|
||||
localparam EXECUTE2 = 1 << EXECUTE2_bit;
|
||||
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
|
||||
|
||||
(* onehot *)
|
||||
reg [NB_STATES-1:0] state;
|
||||
|
||||
// The signals (internal and external) that are determined
|
||||
// combinatorially from state and other signals.
|
||||
|
||||
// The memory-read signal.
|
||||
assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
|
||||
|
||||
// The mask for memory-write.
|
||||
assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
|
||||
|
||||
// aluWr starts computation (shifts) in the ALU.
|
||||
assign aluWr = state[EXECUTE1_bit] & isALU;
|
||||
|
||||
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
|
||||
`ifdef NRV_IS_IO_ADDR
|
||||
wire needToWait = isLoad |
|
||||
isStore & `NRV_IS_IO_ADDR(mem_addr) |
|
||||
aluBusy | isFPU;
|
||||
`else
|
||||
wire needToWait = isLoad | isStore | aluBusy | isFPU;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(!reset) begin
|
||||
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
|
||||
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
|
||||
end else
|
||||
|
||||
// See note [1] at the end of this file.
|
||||
(* parallel_case *)
|
||||
case(1'b1)
|
||||
|
||||
state[WAIT_INSTR_bit]: begin
|
||||
if(!mem_rbusy) begin // may be high when executing from SPI flash
|
||||
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored
|
||||
state <= EXECUTE1; // also the declaration of instr).
|
||||
end
|
||||
end
|
||||
|
||||
state[EXECUTE1_bit]: begin
|
||||
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
|
||||
// Equivalent to:
|
||||
// PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
|
||||
PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
|
||||
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
|
||||
Bimm[ADDR_WIDTH-1:0] );
|
||||
|
||||
// testing instr[5] is equivalent to testing isStore in this context.
|
||||
loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
|
||||
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
|
||||
|
||||
predicate <= predicate_;
|
||||
state <= EXECUTE2;
|
||||
end
|
||||
|
||||
state[EXECUTE2_bit]: begin
|
||||
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
|
||||
jumpToPCplusImm ? PCplusImm :
|
||||
PCplus4;
|
||||
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
|
||||
end
|
||||
|
||||
state[WAIT_ALU_OR_MEM_bit]: begin
|
||||
if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
|
||||
state <= FETCH_INSTR;
|
||||
end
|
||||
end
|
||||
|
||||
default: begin // FETCH_INSTR
|
||||
state <= WAIT_INSTR;
|
||||
end
|
||||
|
||||
endcase
|
||||
end
|
||||
|
||||
/***************************************************************************/
|
||||
// Cycle counter
|
||||
/***************************************************************************/
|
||||
|
||||
`ifdef NRV_COUNTER_WIDTH
|
||||
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
|
||||
`else
|
||||
reg [31:0] cycles;
|
||||
`endif
|
||||
always @(posedge clk) cycles <= cycles + 1;
|
||||
|
||||
endmodule
|
||||
|
||||
/*****************************************************************************/
|
||||
Reference in New Issue
Block a user