/******************************************************************************/ // Electron: valid. fmax: 70 MHz exp. fmax: 80 MHz // TestDrive: morphing tachyon into a RV32IMF core, trying to // preserve maxfreq at each step. // Step 0: Tachyon valid. fmax: 115-120 MHz exp. fmax: 135-140 MHz // Step 1: Barrel shft valid. fmax: 110-115 MHz exp. fmax: 130-135 MHz // Step 2: RV32M valid. fmax: 105-115 MHz exp. fmax: 120 MHz // Step 3: RV32F decod only valid. fmax: 100-105 MHz exp. fmax: 105 MHz // /******************************************************************************/ // Firmware generation flags for this processor `define NRV_ARCH "rv32imaf" `define NRV_ABI "ilp32f" //`define NRV_ARCH "rv32im" //`define NRV_ABI "ilp32" `define NRV_OPTIMIZE "-O3" // Check condition and display message in simulation `ifdef BENCH `define ASSERT(cond,msg) if(!(cond)) $display msg `define ASSERT_NOT_REACHED(msg) $display msg `else `define ASSERT(cond,msg) `define ASSERT_NOT_REACHED(msg) `endif // FPU Normalization needs to detect the position of the first bit set // in the A_frac register. It is easier to count the number of leading // zeroes (CLZ for Count Leading Zeroes), as follows. See: // https://electronics.stackexchange.com/questions/196914/verilog-synthesize-high-speed-leading-zero-count module CLZ #( parameter W_IN = 64, // must be power of 2, >= 2 parameter W_OUT = $clog2(W_IN) ) ( input wire [W_IN-1:0] in, output wire [W_OUT-1:0] out ); generate if(W_IN == 2) begin assign out = !in[1]; end else begin wire [W_OUT-2:0] half_count; wire [W_IN/2-1:0] lhs = in[W_IN/2 +: W_IN/2]; wire [W_IN/2-1:0] rhs = in[0 +: W_IN/2]; wire left_empty = ~|lhs; CLZ #( .W_IN(W_IN/2) ) inner( .in(left_empty ? rhs : lhs), .out(half_count) ); assign out = {left_empty, half_count}; end endgenerate endmodule module FemtoRV32( input clk, output [31:0] mem_addr, // address bus output [31:0] mem_wdata, // data to be written output [3:0] mem_wmask, // write mask for the 4 bytes of each word input [31:0] mem_rdata, // input lines for both data and instr output mem_rstrb, // active to initiate memory read (used by IO) input mem_rbusy, // asserted if memory is busy reading value input mem_wbusy, // asserted if memory is busy writing value input reset // set to 0 to reset the processor ); parameter RESET_ADDR = 32'h00000000; parameter ADDR_WIDTH = 24; localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs // Flip a 32 bit word. Used by the shifter (a single shifter for // left and right shifts, saves silicium !) function [31:0] flip32; input [31:0] x; flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7], x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15], x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23], x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]}; endfunction /***************************************************************************/ // Instruction decoding. /***************************************************************************/ // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction. // Reference: Table page 104 of: // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf // The ALU function, decoded in 1-hot form (doing so reduces LUT count) // It is used as follows: funct3Is[val] <=> funct3 == val (* onehot *) reg [7:0] funct3Is; // Instruction decoder and immediate decoder // Base RISC-V (RV32I) has only 10 different instructions ! reg isLoad, isALUimm, isAUIPC, isStore, isALUreg, isLUI, isBranch, isJALR, isJAL, isSYSTEM, isFPU; reg [31:0] Uimm, Iimm, Simm, Bimm, Jimm; reg rdIsNZ; // Asserted if dest. register is non-zero (writeback) always @(posedge clk) begin if(state[WAIT_INSTR_bit]) begin isLoad <= (mem_rdata[6:3] == 4'b0000); // rd <- mem[rs1+Iimm] isALUimm <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm isAUIPC <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm isStore <= (mem_rdata[6:3] == 4'b0100); // mem[rs1+Simm] <- rs2 isALUreg <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2 isLUI <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm isBranch <= (mem_rdata[6:2] == 5'b11000); // if(rs1OPrs2) PC<-PC+Bimm isJALR <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm isJAL <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm isSYSTEM <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles isFPU <= (mem_rdata[6:5] == 2'b10); // all FPU except FLW/FSW funct3Is <= 8'b00000001 << mem_rdata[14:12]; Uimm <= { mem_rdata[31], mem_rdata[30:12], {12{1'b0}}}; Iimm <= {{21{mem_rdata[31]}}, mem_rdata[30:20]}; Simm <= {{21{mem_rdata[31]}}, mem_rdata[30:25],mem_rdata[11:7]}; Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0}; Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0}; rdIsNZ <= |mem_rdata[11:7]; end end wire isALU = isALUimm | isALUreg; /***************************************************************************/ // The register file. /***************************************************************************/ reg [31:0] rs1; reg [31:0] rs2; reg [31:0] rs3; // this one is used by the FMA instructions. reg [31:0] registerFile [0:63]; // 0..31: integer registers // 32..63: floating-point registers /***************************************************************************/ // The FPU /***************************************************************************/ // instruction decoder reg isFMADD, isFMSUB, isFNMSUB, isFNMADD, isFADD, isFSUB, isFMUL, isFDIV, isFSQRT, isFSGNJ, isFSGNJN, isFSGNJX, isFMIN, isFMAX, isFEQ, isFLT, isFLE, isFCLASS, isFCVTWS, isFCVTWUS, isFCVTSW, isFCVTSWU, isFMVXW, isFMVWX; reg rdIsFP; // Asserted if destination register is a FP register. // rs1 is a FP register if instr[6:5] = 2'b10 except for: // FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[30:28] = 3'b101 // FMV.W.X : instr[6:2] = 5'b10100 and instr[30:28] = 3'b111 // (two versions of the signal, one for regular instruction decode, // the other one for compressed instructions). wire rs1IsFP = (mem_rdata[6:5] == 2'b10 ) && !((mem_rdata[4:2] == 3'b100) && ( (mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U} (mem_rdata[31:28] == 4'b1111) // FMV.W.X ) ); // rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW // (two versions of the signal, one for regular instruction decode, // the other one for compressed instructions). wire rs2IsFP = (mem_rdata[6:5] == 2'b10) || (mem_rdata[6:2]==5'b01001); always @(posedge clk) begin if(state[WAIT_INSTR_bit]) begin isFMADD <= (mem_rdata[4:2] == 3'b000); isFMSUB <= (mem_rdata[4:2] == 3'b001); isFNMSUB <= (mem_rdata[4:2] == 3'b010); isFNMADD <= (mem_rdata[4:2] == 3'b011); isFADD <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00000)); isFSUB <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00001)); isFMUL <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00010)); isFDIV <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00011)); isFSQRT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b01011)); isFSGNJ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b00)); isFSGNJN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b01)); isFSGNJX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b10)); isFMIN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && !mem_rdata[12]); isFMAX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && mem_rdata[12]); isFEQ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b10)); isFLT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b01)); isFLE <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b00)); isFCLASS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && mem_rdata[12]); isFCVTWS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && !mem_rdata[20]); isFCVTWUS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && mem_rdata[20]); isFCVTSW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && !mem_rdata[20]); isFCVTSWU <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && mem_rdata[20]); isFMVXW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && !mem_rdata[12]); isFMVWX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11110)); rdIsFP <= (mem_rdata[6:2] == 5'b00001) || // FLW (mem_rdata[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB (mem_rdata[6:4] == 3'b101 && ( (mem_rdata[31] == 1'b0) || // R-Type FPU (mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U} (mem_rdata[31:28] == 4'b1111) // FMV.W.X ) ); end end // FPU output = 32 MSBs of A register (see below) // A macro to easily write to it (`FPU_OUT <= ...), // used when FPU output is an integer. `define FPU_OUT {A_sign, A_exp[7:0], A_frac[46:24]} wire [31:0] fpuOut = `FPU_OUT; // Two temporary 32-bit registers used by FDIV and FSQRT reg [31:0] tmp1; reg [31:0] tmp2; // Expand the source registers into sign, exponent and fraction. // Normalized, first bit set is bit 23 (addditional bit), or zero. // For now, flush all denormals to zero // TODO: denormals and infinities // Following IEEE754, represented number is +/- frac * 2^(exp-127-23) // (127: bias 23: position of first bit set for normalized numbers) wire rs1_sign = rs1[31]; wire [7:0] rs1_exp = rs1[30:23]; wire [23:0] rs1_frac = rs1_exp == 8'd0 ? 24'b0 : {1'b1, rs1[22:0]}; wire rs2_sign = rs2[31]; wire [7:0] rs2_exp = rs2[30:23]; wire [23:0] rs2_frac = rs2_exp == 8'd0 ? 24'b0 : {1'b1, rs2[22:0]}; wire rs3_sign = rs3[31]; wire [7:0] rs3_exp = rs3[30:23]; wire [23:0] rs3_frac = rs3_exp == 8'd0 ? 24'b0 : {1'b1, rs3[22:0]}; // Two high-resolution registers // Register A has the accumulator / shifters / leading zero counter // Normalized if first bit set is bit 47 // Represented number is +/- frac * 2^(exp-127-47) reg A_sign; reg signed [8:0] A_exp; reg signed [49:0] A_frac; reg B_sign; reg signed [8:0] B_exp; reg signed [49:0] B_frac; // ******************* Comparisons ****************************************** // Exponent adder wire signed [8:0] exp_sum = B_exp + A_exp; wire signed [8:0] exp_diff = B_exp - A_exp; wire expA_EQ_expB = (exp_diff == 0); wire fracA_EQ_fracB = (frac_diff == 0); wire fabsA_EQ_fabsB = (expA_EQ_expB && fracA_EQ_fracB); wire fabsA_LT_fabsB = (!exp_diff[8] && !expA_EQ_expB) || (expA_EQ_expB && !fracA_EQ_fracB && !frac_diff[50]); wire fabsA_LE_fabsB = (!exp_diff[8] && !expA_EQ_expB) || (expA_EQ_expB && !frac_diff[50]); wire fabsB_LT_fabsA = exp_diff[8] || (expA_EQ_expB && frac_diff[50]); wire fabsB_LE_fabsA = exp_diff[8] || (expA_EQ_expB && (frac_diff[50] || fracA_EQ_fracB)); wire A_LT_B = A_sign && !B_sign || A_sign && B_sign && fabsB_LT_fabsA || !A_sign && !B_sign && fabsA_LT_fabsB ; wire A_LE_B = A_sign && !B_sign || A_sign && B_sign && fabsB_LE_fabsA || !A_sign && !B_sign && fabsA_LE_fabsB ; wire A_EQ_B = fabsA_EQ_fabsB && (A_sign == B_sign); // ****************** Addition, subtraction ********************************* wire signed [50:0] frac_sum = B_frac + A_frac; wire signed [50:0] frac_diff = B_frac - A_frac; // ****************** Product *********************************************** wire [49:0] prod_frac = rs1_frac * rs2_frac; // TODO: check overflows // exponent of product, once normalized // (obtained by writing expression of product and inspecting exponent) // Two cases: first bit set = 47 or 46 (only possible cases with normals) wire signed [8:0] prod_exp_norm = rs1_exp+rs2_exp-127+{7'b0,prod_frac[47]}; // detect null product and underflows (all denormals are flushed to zero) wire prod_Z = (prod_exp_norm <= 0) || !(|prod_frac[47:46]); // ****************** Normalization ***************************************** // Count leading zeroes in A // Note1: CLZ only work with power of two width (hence 14'b0). // Note2: first bit set = 63 - CLZ (of course !) wire [5:0] A_clz; CLZ clz({14'b0,A_frac}, A_clz); // Exponent of A once normalized = A_exp + first_bit_set - 47 // = A_exp + 63 - clz - 47 = A_exp + 16 - clz wire signed [8:0] A_exp_norm = A_exp + 16 - {3'b000,A_clz}; // ****************** Reciprocal (1/x), used by FDIV ************************ // Exponent for reciprocal (1/x) // Initial value of x kept in tmp2. wire signed [8:0] frcp_exp = 9'd126 + A_exp - $signed({1'b0, tmp2[30:23]}); // ****************** Reciprocal square root (1/sqrt(x)) ******************** // https://en.wikipedia.org/wiki/Fast_inverse_square_root wire [31:0] rsqrt_doom_magic = 32'h5f3759df - {1'b0,rs1[30:1]}; // ****************** Float to Integer conversion *************************** // -127-23 is standard exponent bias // -6 because it is bit 29 of rs1 that corresponds to bit 47 of A_frac, // instead of bit 23 (and 23-29 = -6). wire signed [8:0] fcvt_ftoi_shift = rs1_exp - 9'd127 - 9'd23 - 9'd6; wire signed [8:0] neg_fcvt_ftoi_shift = -fcvt_ftoi_shift; wire [31:0] A_fcvt_ftoi_shifted = fcvt_ftoi_shift[8] ? // R or L shift (|neg_fcvt_ftoi_shift[8:5] ? 0 : // underflow ({A_frac[49:18]} >> neg_fcvt_ftoi_shift[4:0])) : ({A_frac[49:18]} << fcvt_ftoi_shift[4:0]); // ******************* Classification *************************************** wire rs1_exp_Z = (rs1_exp == 0 ); wire rs1_exp_255 = (rs1_exp == 255); wire rs1_frac_Z = (rs1_frac == 0 ); wire [31:0] fclass = { 22'b0, rs1_exp_255 & rs1_frac[22], // 9: quiet NaN rs1_exp_255 & !rs1_frac[22] & (|rs1_frac[21:0]), // 8: sig NaN !rs1_sign & rs1_exp_255 & rs1_frac_Z, // 7: +infinity !rs1_sign & !rs1_exp_Z & !rs1_exp_255, // 6: +normal !rs1_sign & rs1_exp_Z & !rs1_frac_Z, // 5: +subnormal !rs1_sign & rs1_exp_Z & rs1_frac_Z, // 4: +0 rs1_sign & rs1_exp_Z & rs1_frac_Z, // 3: -0 rs1_sign & rs1_exp_Z & !rs1_frac_Z, // 2: -subnormal rs1_sign & !rs1_exp_Z & !rs1_exp_255, // 1: -normal rs1_sign & rs1_exp_255 & rs1_frac_Z // 0: -infinity }; /** FPU micro-instructions *************************************************/ localparam FPMI_READY = 0; localparam FPMI_LOAD_AB = 1; // A <- fprs1; B <- fprs2 localparam FPMI_LOAD_AB_MUL = 2; // A <- norm(fprs1*fprs2); B <- fprs3 localparam FPMI_NORM = 3; // A <- norm(A) localparam FPMI_ADD_SWAP = 4; // if |A| > |B| swap(A,B) localparam FPMI_ADD_SHIFT = 5; // shift A to match B exponent localparam FPMI_ADD_ADD = 6; // A <- A + B (or A - B if FSUB) localparam FPMI_CMP = 7; // fpuOut <- test A,B (FEQ,FLE,FLT) localparam FPMI_MV_RS1_A = 8; // fprs1 <- A localparam FPMI_MV_RS2_TMP1 = 9; // fprs1 <- tmp1 localparam FPMI_MV_RS2_MHTMP1 = 10; // fprs2 <- -0.5*tmp1 localparam FPMI_MV_RS2_TMP2 = 11; // fprs2 <- tmp2 localparam FPMI_MV_TMP2_A = 12; // tmp2 <- A localparam FPMI_FRCP_PROLOG = 13; // init reciprocal (1/x) localparam FPMI_FRCP_ITER = 14; // iteration for reciprocal localparam FPMI_FRCP_EPILOG = 15; // epilog for reciprocal localparam FPMI_FRSQRT_PROLOG = 16; // init recipr sqr root (1/sqrt(x)) localparam FPMI_FP_TO_INT = 17; // fpuOut <- fpoint_to_int(fprs1) localparam FPMI_INT_TO_FP = 18; // A <- int_to_fpoint(rs1) localparam FPMI_MIN_MAX = 19; // fpuOut <- min/max(A,B) localparam FPMI_NB = 20; // Instruction exit flag (if set in current micro-instr, exit microprogram) localparam FPMI_EXIT_FLAG_bit = 1+$clog2(FPMI_NB); localparam FPMI_EXIT_FLAG = 1 << FPMI_EXIT_FLAG_bit; reg [6:0] fpmi_PC; // current micro-instruction pointer reg [1+$clog2(FPMI_NB):0] fpmi_instr; // current micro-instruction // current micro-instruction as 1-hot: fpmi_instr == NNN <=> fpmi_is[NNN] (* onehot *) wire [FPMI_NB-1:0] fpmi_is = 1 << fpmi_instr[$clog2(FPMI_NB):0]; initial fpmi_PC = 0; wire fpuBusy = !fpmi_is[FPMI_READY]; // micro-program ROM (wired // as a combinatorial function). always @(*) begin case(fpmi_PC) 0: fpmi_instr = FPMI_READY; // FLT, FLE, FEQ 1: fpmi_instr = FPMI_LOAD_AB; 2: fpmi_instr = FPMI_CMP | FPMI_EXIT_FLAG; // FADD, FSUB 3: fpmi_instr = FPMI_LOAD_AB; // A <- fprs1, B <- fprs2 4: fpmi_instr = FPMI_ADD_SWAP; // if(|A| > |B|) swap(A,B) 5: fpmi_instr = FPMI_ADD_SHIFT; // shift A according to B exp 6: fpmi_instr = FPMI_ADD_ADD; // A <- A + B ( or A - B if FSUB) 7: fpmi_instr = FPMI_NORM | // A <- normalize(A) FPMI_EXIT_FLAG; // FMUL 8: fpmi_instr = FPMI_LOAD_AB_MUL | // A <- normalize(fprs1*fprs2) FPMI_EXIT_FLAG; // FMADD, FMSUB, FNMADD, FNMSUB 9: fpmi_instr = FPMI_LOAD_AB_MUL; // A <- norm(fprs1*fprs2), B <- fprs3 10: fpmi_instr = FPMI_ADD_SWAP; // if(|A| > |B|) swap(A,B) 11: fpmi_instr = FPMI_ADD_SHIFT; // shift A according to B exp 12: fpmi_instr = FPMI_ADD_ADD; // A <- A + B ( or A - B if FSUB) 13: fpmi_instr = FPMI_NORM | // A <- normalize(A) FPMI_EXIT_FLAG; // FDIV // using Newton-Raphson: // https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division // STEP 1 : D' <- fprs2 normalized between [0.5,1] (set exp to 126) // A <- -D'*32/17 + 48/17 // STEP 2,3: A <- A * (-A*D+2) (two iterations) // STEP 4 : A <- fprs1 * A 14: fpmi_instr = FPMI_FRCP_PROLOG; // STEP 1: A <- -D'*32/17 + 48/17 15: fpmi_instr = FPMI_LOAD_AB_MUL; // --- 16: fpmi_instr = FPMI_ADD_SWAP; // | 17: fpmi_instr = FPMI_ADD_SHIFT; // FMADD 18: fpmi_instr = FPMI_ADD_ADD; // | 19: fpmi_instr = FPMI_NORM; // --- 20: fpmi_instr = FPMI_FRCP_ITER; // STEP 2: A <- A * (-A*D + 2) 21: fpmi_instr = FPMI_LOAD_AB_MUL; // --- 22: fpmi_instr = FPMI_ADD_SWAP; // | 23: fpmi_instr = FPMI_ADD_SHIFT; // FMADD 24: fpmi_instr = FPMI_ADD_ADD; // | 25: fpmi_instr = FPMI_NORM; // --- 26: fpmi_instr = FPMI_MV_RS1_A; // 27: fpmi_instr = FPMI_LOAD_AB_MUL; // FMUL 28: fpmi_instr = FPMI_FRCP_ITER; // STEP 3: A <- A * (-A*D + 2) 29: fpmi_instr = FPMI_LOAD_AB_MUL; // --- 30: fpmi_instr = FPMI_ADD_SWAP; // | 31: fpmi_instr = FPMI_ADD_SHIFT; // FMADD 32: fpmi_instr = FPMI_ADD_ADD; // | 33: fpmi_instr = FPMI_NORM; // --- 34: fpmi_instr = FPMI_MV_RS1_A; // 35: fpmi_instr = FPMI_LOAD_AB_MUL; // FMUL 36: fpmi_instr = FPMI_FRCP_EPILOG; // STEP 4: A <- fprs1^(-1) * fprs2 37: fpmi_instr = FPMI_LOAD_AB_MUL | // FMUL FPMI_EXIT_FLAG; // FCVT.W.S, FCVT.WU.S 38: fpmi_instr = FPMI_LOAD_AB; 39: fpmi_instr = FPMI_FP_TO_INT | FPMI_EXIT_FLAG; // FCVT.S.W, FCVT.S.WU 40: fpmi_instr = FPMI_INT_TO_FP; 41: fpmi_instr = FPMI_NORM | FPMI_EXIT_FLAG; // FSQRT // Using Doom's fast inverse square root algorithm: // https://en.wikipedia.org/wiki/Fast_inverse_square_root // STEP 1 : A <- doom_magic - (A >> 1) // STEP 2,3: A <- A * (3/2 - (fprs1/2 * A * A)) 42: fpmi_instr = FPMI_FRSQRT_PROLOG; 43: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL 44: fpmi_instr = FPMI_MV_RS1_A; 45: fpmi_instr = FPMI_MV_RS2_MHTMP1; 46: fpmi_instr = FPMI_LOAD_AB_MUL; // --- 47: fpmi_instr = FPMI_ADD_SWAP; // | 48: fpmi_instr = FPMI_ADD_SHIFT; // FMADD 49: fpmi_instr = FPMI_ADD_ADD; // | 50: fpmi_instr = FPMI_NORM; // --- 51: fpmi_instr = FPMI_MV_RS1_A; 52: fpmi_instr = FPMI_MV_RS2_TMP2; 53: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL 54: fpmi_instr = FPMI_MV_TMP2_A; 55: fpmi_instr = FPMI_MV_RS1_A; 56: fpmi_instr = FPMI_MV_RS2_TMP2; 57: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL 58: fpmi_instr = FPMI_MV_RS1_A; 59: fpmi_instr = FPMI_MV_RS2_MHTMP1; 60: fpmi_instr = FPMI_LOAD_AB_MUL; // --- 61: fpmi_instr = FPMI_ADD_SWAP; // | 62: fpmi_instr = FPMI_ADD_SHIFT; // FMADD 63: fpmi_instr = FPMI_ADD_ADD; // | 64: fpmi_instr = FPMI_NORM; // --- 65: fpmi_instr = FPMI_MV_RS1_A; 66: fpmi_instr = FPMI_MV_RS2_TMP2; 67: fpmi_instr = FPMI_LOAD_AB_MUL; // -- FMUL 68: fpmi_instr = FPMI_MV_RS1_A; 69: fpmi_instr = FPMI_MV_RS2_TMP1; 70: fpmi_instr = FPMI_LOAD_AB_MUL | // -- FMUL FPMI_EXIT_FLAG; // FMIN, FMAX 71: fpmi_instr = FPMI_LOAD_AB; 72: fpmi_instr = FPMI_MIN_MAX | FPMI_EXIT_FLAG ; default: begin `ASSERT_NOT_REACHED(("Invalid microcode address: %d",fpmi_PC)); fpmi_instr = 7'bXXXXXXX; end endcase end // micro-programs localparam FPMPROG_CMP = 1; localparam FPMPROG_ADD = 3; localparam FPMPROG_MUL = 8; localparam FPMPROG_MADD = 9; localparam FPMPROG_DIV = 14; localparam FPMPROG_TO_INT = 38; localparam FPMPROG_INT_TO_FP = 40; localparam FPMPROG_SQRT = 42; localparam FPMPROG_MIN_MAX = 71; always @(posedge clk) begin if(state[WAIT_INSTR_bit]) begin // Fetch registers as soon as instruction is ready. rs1 <= registerFile[{rs1IsFP,mem_rdata[19:15]}]; rs2 <= registerFile[{rs2IsFP,mem_rdata[24:20]}]; rs3 <= registerFile[{1'b1, mem_rdata[31:27]}]; end else if(state[EXECUTE2_bit] & isFPU) begin // Execute single-cycle intructions and call micro-program // for micro-programmed ones. (* parallel_case *) case(1'b1) // Single-cycle instructions isFSGNJ : `FPU_OUT <= { rs2[31], rs1[30:0]}; isFSGNJN : `FPU_OUT <= { !rs2[31], rs1[30:0]}; isFSGNJX : `FPU_OUT <= { rs1[31]^rs2[31], rs1[30:0]}; isFCLASS : `FPU_OUT <= fclass; isFMVXW | isFMVWX : `FPU_OUT <= rs1; // Micro-programmed instructions isFLT | isFLE | isFEQ : fpmi_PC <= FPMPROG_CMP; isFADD | isFSUB : fpmi_PC <= FPMPROG_ADD; isFMUL : fpmi_PC <= FPMPROG_MUL; isFMADD | isFMSUB | isFNMADD | isFNMSUB : fpmi_PC <= FPMPROG_MADD; isFDIV : fpmi_PC <= FPMPROG_DIV; isFSQRT : fpmi_PC <= FPMPROG_SQRT; isFCVTWS | isFCVTWUS : fpmi_PC <= FPMPROG_TO_INT; isFCVTSW | isFCVTSWU : fpmi_PC <= FPMPROG_INT_TO_FP; isFMIN | isFMAX : fpmi_PC <= FPMPROG_MIN_MAX; endcase `ifdef VERILATORXXX (* parallel_case *) case(1'b1) isFMADD : `FPU_OUT <= $c32("FMADD(",rs1,",",rs2,",",rs3,")"); isFMSUB : `FPU_OUT <= $c32("FMSUB(",rs1,",",rs2,",",rs3,")"); isFNMSUB : `FPU_OUT <= $c32("FNMSUB(",rs1,",",rs2,",",rs3,")"); isFNMADD : `FPU_OUT <= $c32("FNMADD(",rs1,",",rs2,",",rs3,")"); isFMUL : `FPU_OUT <= $c32("FMUL(",rs1,",",rs2,")"); isFADD : `FPU_OUT <= $c32("FADD(",rs1,",",rs2,")"); isFSUB : `FPU_OUT <= $c32("FSUB(",rs1,",",rs2,")"); isFDIV : `FPU_OUT <= $c32("FDIV(",rs1,",",rs2,")"); isFSQRT : `FPU_OUT <= $c32("FSQRT(",rs1,")"); isFSGNJ : `FPU_OUT <= $c32("FSGNJ(",rs1,",",rs2,")"); isFSGNJN : `FPU_OUT <= $c32("FSGNJN(",rs1,",",rs2,")"); isFSGNJX : `FPU_OUT <= $c32("FSGNJX(",rs1,",",rs2,")"); isFMIN : `FPU_OUT <= $c32("FMIN(",rs1,",",rs2,")"); isFMAX : `FPU_OUT <= $c32("FMAX(",rs1,",",rs2,")"); isFEQ : `FPU_OUT <= $c32("FEQ(",rs1,",",rs2,")"); isFLE : `FPU_OUT <= $c32("FLE(",rs1,",",rs2,")"); isFLT : `FPU_OUT <= $c32("FLT(",rs1,",",rs2,")"); isFCLASS : `FPU_OUT <= $c32("FCLASS(",rs1,")") ; isFCVTWS : `FPU_OUT <= $c32("FCVTWS(",rs1,")"); isFCVTWUS: `FPU_OUT <= $c32("FCVTWUS(",rs1,")"); isFCVTSW : `FPU_OUT <= $c32("FCVTSW(",rs1,")"); isFCVTSWU: `FPU_OUT <= $c32("FCVTSWU(",rs1,")"); isFMVXW: `FPU_OUT <= rs1; isFMVWX: `FPU_OUT <= rs1; endcase `endif end else if(fpuBusy) begin // Increment micro-program counter. fpmi_PC <= fpmi_instr[FPMI_EXIT_FLAG_bit] ? 0 : fpmi_PC+1; // Implementation of the micro-instructions (* parallel_case *) case(1'b1) // A <- rs1 ; B <- rs2 fpmi_is[FPMI_LOAD_AB]: begin A_sign <= rs1_sign; A_frac <= {2'b0, rs1_frac, 24'd0}; A_exp <= {1'b0, rs1_exp}; B_sign <= rs2_sign ^ isFSUB; B_frac <= {2'b0, rs2_frac, 24'd0}; B_exp <= {1'b0, rs2_exp}; end // A <- (+/-) normalize(rs1*rs2); B <- (+/-)rs3 fpmi_is[FPMI_LOAD_AB_MUL]: begin A_sign <= rs1_sign ^ rs2_sign ^ (isFNMSUB | isFNMADD); A_frac <= prod_Z ? 0 : (prod_frac[47] ? prod_frac : {prod_frac[48:0],1'b0}); A_exp <= prod_Z ? 0 : prod_exp_norm; B_sign <= rs3_sign ^ (isFMSUB | isFNMADD); B_frac <= {2'b0, rs3_frac, 24'd0}; B_exp <= {1'b0, rs3_exp}; end // A <- normalize(A) fpmi_is[FPMI_NORM]: begin if(A_exp_norm <= 0 || (A_frac == 0)) begin A_frac <= 0; A_exp <= 0; end else begin // left shamt = 47 - first_bit_set = A_clz - 16 // (reminder: first_bit_set = 63 - A_clz) `ASSERT( 63 - A_clz <= 48, ("NORM: first bit set = %d\n",63-A_clz) ); A_frac <= A_frac[48] ? (A_frac >> 1) : A_frac << (A_clz - 16); A_exp <= A_exp_norm; end end // if(|A| > |B|) swap(A,B) fpmi_is[FPMI_ADD_SWAP]: begin if(fabsB_LT_fabsA) begin A_frac <= B_frac; B_frac <= A_frac; A_exp <= B_exp; B_exp <= A_exp; A_sign <= B_sign; B_sign <= A_sign; end end // shift A in order to make it match B exponent fpmi_is[FPMI_ADD_SHIFT]: begin `ASSERT(!fabsB_LT_fabsA, ("ADD_SHIFT: incorrect order")); A_frac <= (exp_diff > 47) ? 0 : (A_frac >> exp_diff[5:0]); A_exp <= B_exp; end // A <- A (+/-) B fpmi_is[FPMI_ADD_ADD]: begin A_frac <= (A_sign ^ B_sign) ? frac_diff[49:0] : frac_sum[49:0]; A_sign <= B_sign; end // A <- result of comparison between A and B fpmi_is[FPMI_CMP]: begin `FPU_OUT <= { 31'b0, isFLT && A_LT_B || isFLE && A_LE_B || isFEQ && A_EQ_B }; end fpmi_is[FPMI_MV_RS2_TMP1] : rs2 <= tmp1; fpmi_is[FPMI_MV_RS2_TMP2] : rs2 <= tmp2; fpmi_is[FPMI_MV_RS1_A] : rs1 <= {A_sign,A_exp[7:0],A_frac[46:24]}; fpmi_is[FPMI_MV_TMP2_A] : tmp2 <= {A_sign,A_exp[7:0],A_frac[46:24]}; // rs2 <= -|tmp1| / 2.0 fpmi_is[FPMI_MV_RS2_MHTMP1]:rs2<={1'b1,tmp1[30:23]-8'd1,tmp1[22:0]}; fpmi_is[FPMI_FRCP_PROLOG]: begin tmp1 <= rs1; tmp2 <= rs2; // rs1 <= -D', that is, -(fprs2 normalized in [0.5,1]) rs1 <= {1'b1, 8'd126, rs2_frac[22:0]}; rs2 <= 32'h3FF0F0F1; // 32/17 rs3 <= 32'h4034B4B5; // 48/17 end fpmi_is[FPMI_FRCP_ITER]: begin rs1 <= {1'b1, 8'd126, tmp2[22:0]}; // -D' rs2 <= {A_sign, A_exp[7:0], A_frac[46:24]}; // A rs3 <= 32'h40000000; // 2.0 end fpmi_is[FPMI_FRCP_EPILOG]: begin rs1 <= {tmp2[31], frcp_exp[7:0], A_frac[46:24]}; rs2 <= tmp1; end fpmi_is[FPMI_FRSQRT_PROLOG]: begin tmp1 <= rs1; tmp2 <= rsqrt_doom_magic; rs1 <= rsqrt_doom_magic; rs2 <= rsqrt_doom_magic; rs3 <= 32'h3fc00000; // 1.5 end fpmi_is[FPMI_FP_TO_INT]: begin // TODO: check overflow `FPU_OUT <= (isFCVTWUS | !A_sign) ? A_fcvt_ftoi_shifted : -$signed(A_fcvt_ftoi_shifted); end fpmi_is[FPMI_INT_TO_FP]: begin // TODO: rounding A_frac <= (isFCVTSWU | !rs1[31]) ? {rs1, 18'd0} : {-$signed(rs1), 18'd0}; A_sign <= isFCVTSW & rs1[31]; // 127+23: standard exponent bias // +6 because it is bit 29 of rs1 that overwrites // bit 47 of A_frac, instead of bit 23 (and 29-23 = 6). A_exp <= 127+23+6; end fpmi_is[FPMI_MIN_MAX]: begin `FPU_OUT <= (A_LT_B ^ isFMAX) ? {A_sign, A_exp[7:0], A_frac[46:24]} : {B_sign, B_exp[7:0], B_frac[46:24]}; end endcase // register write-back end else if( !(isBranch | isStore) & (rdIsFP | rdIsNZ) & (state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]) ) begin registerFile[{rdIsFP,instr[11:7]}] <= writeBackData; end end `ifdef VERILATOR // When doing simulations, compare the result of all operations with // what's computed on the host CPU. reg [31:0] z; reg [31:0] rs1_bkp; reg [31:0] rs2_bkp; reg [31:0] rs3_bkp; always @(posedge clk) begin // Some micro-coded instructions (FDIV/FSQRT) use rs1, rs2 and // rs3 as temporaty registers, so we need to save them to be able // to recompute the operation on the host CPU. if(isFPU && state[EXECUTE2_bit]) begin rs1_bkp <= rs1; rs2_bkp <= rs2; rs3_bkp <= rs3; end if( isFPU && state[WAIT_ALU_OR_MEM_bit] && fpmi_PC == 0 ) begin case(1'b1) isFMUL: z <= $c32("CHECK_FMUL(",fpuOut,",",rs1,",",rs2,")"); isFADD: z <= $c32("CHECK_FADD(",fpuOut,",",rs1,",",rs2,")"); isFSUB: z <= $c32("CHECK_FSUB(",fpuOut,",",rs1,",",rs2,")"); // my FDIV and FSQRT are not IEEE754 compliant ! // (checks commented-out for now) // Note: checks use rs1_bkp and rs2_bkp because // FDIV and FSQRT overwrite rs1 and rs2 // //isFDIV: // z<=$c32("CHECK_FDIV(",fpuOut,",",rs1_bkp,",",rs2_bkp,")"); //isFSQRT: // z<=$c32("CHECK_FSQRT(",fpuOut,",",rs1_bkp,")"); isFMADD : z<=$c32("CHECK_FMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")"); isFMSUB : z<=$c32("CHECK_FMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")"); isFNMSUB: z<=$c32("CHECK_FNMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")"); isFNMADD: z<=$c32("CHECK_FNMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")"); isFEQ: z <= $c32("CHECK_FEQ(",fpuOut,",",rs1,",",rs2,")"); isFLT: z <= $c32("CHECK_FLT(",fpuOut,",",rs1,",",rs2,")"); isFLE: z <= $c32("CHECK_FLE(",fpuOut,",",rs1,",",rs2,")"); isFCVTWS : z <= $c32("CHECK_FCVTWS(",fpuOut,",",rs1,")"); isFCVTWUS: z <= $c32("CHECK_FCVTWUS(",fpuOut,",",rs1,")"); isFCVTSW : z <= $c32("CHECK_FCVTSW(",fpuOut,",",rs1,")"); isFCVTSWU: z <= $c32("CHECK_FCVTSWU(",fpuOut,",",rs1,")"); isFMIN: z <= $c32("CHECK_FMIN(",fpuOut,",",rs1,",",rs2,")"); isFMAX: z <= $c32("CHECK_FMAX(",fpuOut,",",rs1,",",rs2,")"); endcase end end `endif /***************************************************************************/ // The ALU. Does operations and tests combinatorially, except DIV /***************************************************************************/ // First ALU source, always rs1 wire [31:0] aluIn1 = rs1; // Second ALU source, depends on opcode: // ALUreg, Branch: rs2 // ALUimm, Load, JALR: Iimm wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm; wire aluWr; // ALU write strobe // The adder is used by both arithmetic instructions and JALR. wire [31:0] aluPlus = aluIn1 + aluIn2; // Use a single 33 bits subtract to do subtraction and all comparisons // (trick borrowed from swapforth/J1) wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1; wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32]; wire LTU = aluMinus[32]; wire EQ = (aluMinus[31:0] == 0); /***************************************************************************/ // Use the same shifter both for left and right shifts by // applying bit reversal wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1; /* verilator lint_off WIDTH */ wire [31:0] shifter = $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0]; /* verilator lint_on WIDTH */ wire [31:0] leftshift = flip32(shifter); /***************************************************************************/ // funct3: 1->MULH, 2->MULHSU 3->MULHU wire isMULH = funct3Is[1]; wire isMULHSU = funct3Is[2]; wire sign1 = aluIn1[31] & isMULH; wire sign2 = aluIn2[31] & (isMULH | isMULHSU); wire signed [32:0] signed1 = {sign1, aluIn1}; wire signed [32:0] signed2 = {sign2, aluIn2}; wire signed [63:0] multiply = signed1 * signed2; /***************************************************************************/ // Notes: // - instr[30] is 1 for SUB and 0 for ADD // - for SUB, need to test also instr[5] to discriminate ADDI: // (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !) // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL wire [31:0] alu_base = (funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) | (funct3Is[1] ? leftshift : 32'b0) | (funct3Is[2] ? {31'b0, LT} : 32'b0) | (funct3Is[3] ? {31'b0, LTU} : 32'b0) | (funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) | (funct3Is[5] ? shifter : 32'b0) | (funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) | (funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ; // funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU // 4->DIV 5->DIVU 6->REM 7->REMU wire [31:0] alu_mul = funct3Is[0] ? multiply[31: 0] // 0:MUL : multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend) : (div_sign ? -quotient : quotient); wire aluBusy = |quotient_msk; // ALU is busy if division in progress. reg [31:0] aluOut; wire funcM = instr[25]; wire isDivide = instr[14]; always @(posedge clk) begin aluOut <= (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base; end /***************************************************************************/ // Implementation of DIV/REM instructions, highly inspired by PicoRV32 reg div_sign; reg [31:0] dividend; reg [62:0] divisor; reg [31:0] quotient; reg [32:0] quotient_msk; always @(posedge clk) begin if (aluWr) begin dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1; divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0}; quotient <= 0; quotient_msk[32] <= isALUreg & funcM & isDivide; div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] : (aluIn1[31] ^ aluIn2[31]) & |aluIn2); end else begin divisor <= divisor >> 1; quotient_msk <= quotient_msk >> 1; if(divisor <= {31'b0, dividend}) begin quotient <= {quotient[30:0],1'b1}; dividend <= dividend - divisor[31:0]; end else begin quotient <= {quotient[30:0],1'b0}; end end end /***************************************************************************/ // The predicate for conditional branches. /***************************************************************************/ wire predicate_ = funct3Is[0] & EQ | // BEQ funct3Is[1] & !EQ | // BNE funct3Is[4] & LT | // BLT funct3Is[5] & !LT | // BGE funct3Is[6] & LTU | // BLTU funct3Is[7] & !LTU ; // BGEU reg predicate; /***************************************************************************/ // Program counter and branch target computation. /***************************************************************************/ reg [ADDR_WIDTH-1:0] PC; // The program counter. reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are // ignored (not used in RV32I base instr set). wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4; // An adder used to compute branch address, JAL address and AUIPC. reg [ADDR_WIDTH-1:0] PCplusImm; // A separate adder to compute the destination of load/store. reg [ADDR_WIDTH-1:0] loadstore_addr; assign mem_addr = {ADDR_PAD, state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ? PC : loadstore_addr }; /***************************************************************************/ // The value written back to the register file. /***************************************************************************/ wire [31:0] writeBackData = /* verilator lint_off WIDTH */ (isSYSTEM ? cycles : 32'b0) | // SYSTEM /* verilator lint_on WIDTH */ (isLUI ? Uimm : 32'b0) | // LUI (isALU ? aluOut : 32'b0) | // ALUreg, ALUimm (isFPU ? fpuOut : 32'b0) | // FPU (isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC (isJALR | isJAL ? {ADDR_PAD,PCplus4 } : 32'b0) | // JAL, JALR (isLoad ? LOAD_data : 32'b0); // Load /***************************************************************************/ // LOAD/STORE /***************************************************************************/ // All memory accesses are aligned on 32 bits boundary. For this // reason, we need some circuitry that does unaligned halfword // and byte load/store, based on: // - funct3[1:0]: 00->byte 01->halfword 10->word (=instr[13:12]) // - mem_addr[1:0]: indicates which byte/halfword is accessed // - instr[2] is set for FLW and FSW. wire mem_byteAccess = !instr[2] && (instr[13:12] == 2'b00); wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01); // LOAD, in addition to funct3[1:0], LOAD depends on: // - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion wire LOAD_sign = !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]); wire [31:0] LOAD_data = mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} : mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} : mem_rdata ; wire [15:0] LOAD_halfword = loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0]; wire [7:0] LOAD_byte = loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0]; // STORE assign mem_wdata[ 7: 0] = rs2[7:0]; assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8]; assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16]; assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] : loadstore_addr[1] ? rs2[15:8] : rs2[31:24]; // The memory write mask: // 1111 if writing a word // 0011 or 1100 if writing a halfword // (depending on loadstore_addr[1]) // 0001, 0010, 0100 or 1000 if writing a byte // (depending on loadstore_addr[1:0]) wire [3:0] STORE_wmask = mem_byteAccess ? (loadstore_addr[1] ? (loadstore_addr[0] ? 4'b1000 : 4'b0100) : (loadstore_addr[0] ? 4'b0010 : 4'b0001) ) : mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011) : 4'b1111; /*************************************************************************/ // And, last but not least, the state machine. /*************************************************************************/ localparam FETCH_INSTR_bit = 0; localparam WAIT_INSTR_bit = 1; localparam EXECUTE1_bit = 2; localparam EXECUTE2_bit = 3; localparam WAIT_ALU_OR_MEM_bit = 4; localparam NB_STATES = 5; localparam FETCH_INSTR = 1 << FETCH_INSTR_bit; localparam WAIT_INSTR = 1 << WAIT_INSTR_bit; localparam EXECUTE1 = 1 << EXECUTE1_bit; localparam EXECUTE2 = 1 << EXECUTE2_bit; localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit; (* onehot *) reg [NB_STATES-1:0] state; // The signals (internal and external) that are determined // combinatorially from state and other signals. // The memory-read signal. assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit]; // The mask for memory-write. assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask; // aluWr starts computation (shifts) in the ALU. assign aluWr = state[EXECUTE1_bit] & isALU; wire jumpToPCplusImm = isJAL | (isBranch & predicate); `ifdef NRV_IS_IO_ADDR wire needToWait = isLoad | isStore & `NRV_IS_IO_ADDR(mem_addr) | aluBusy | isFPU; `else wire needToWait = isLoad | isStore | aluBusy | isFPU; `endif always @(posedge clk) begin if(!reset) begin state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy PC <= RESET_ADDR[ADDR_WIDTH-1:0]; end else // See note [1] at the end of this file. (* parallel_case *) case(1'b1) state[WAIT_INSTR_bit]: begin if(!mem_rbusy) begin // may be high when executing from SPI flash instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored state <= EXECUTE1; // also the declaration of instr). end end state[EXECUTE1_bit]: begin // branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm // Equivalent to: // PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm) PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] : instr[4] ? Uimm[ADDR_WIDTH-1:0] : Bimm[ADDR_WIDTH-1:0] ); // testing instr[5] is equivalent to testing isStore in this context. loadstore_addr <= rs1[ADDR_WIDTH-1:0] + (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]); predicate <= predicate_; state <= EXECUTE2; end state[EXECUTE2_bit]: begin PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} : jumpToPCplusImm ? PCplusImm : PCplus4; state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR; end state[WAIT_ALU_OR_MEM_bit]: begin if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin state <= FETCH_INSTR; end end default: begin // FETCH_INSTR state <= WAIT_INSTR; end endcase end /***************************************************************************/ // Cycle counter /***************************************************************************/ `ifdef NRV_COUNTER_WIDTH reg [`NRV_COUNTER_WIDTH-1:0] cycles; `else reg [31:0] cycles; `endif always @(posedge clk) cycles <= cycles + 1; endmodule /*****************************************************************************/