// femtorv32, a minimalistic RISC-V RV32I core
//    (minus SYSTEM and FENCE that are not implemented)
// Bruno Levy, May-June 2020
//
// drop-in replacement of femtorv32,
//   does 3 CPIs (cycles per instructions) in linear execution flow
//   (to be compared with 2 CPIs with femtorv32.v),
//   saves 20-50 LUTs
// in femtosoc.v, replace `include "femtorv32.v"
//                   with `include "mini_femtorv32.v"
//
// NOTE: the structure of the decoder has changed, *** NEEDS TO BE ADAPTED ***

/*******************************************************************/

`include "utils.v"                 // Utilities, macros for debugging
`include "register_file.v"         // The 31 general-purpose registers
`include "small_alu.v"             // Used on IceStick, RV32I
`include "large_alu.v"             // For larger FPGAs, RV32IM
`include "branch_predicates.v"     // Tests for branch instructions
`include "decoder.v"               // The instruction decoder
`include "aligned_memory_access.v" // Read/write bytes, hwords and words from memory
`include "CSR_file.v"              // (Optional) Control and Status registers

/********************* Nrv processor *******************************/

module FemtoRV32 #(
   parameter [0:0] RV32M      = 0, // Set to 1 to support mul/div/rem instructions
   parameter       ADDR_WIDTH = 16 // width of the address bus
) (
   input              clk,

   // Memory interface: using the same protocol as Claire Wolf's picorv32
   //    (WIP: add mem_valid / mem_ready protocol)
   // NOTE: mem_rbusy and mem_wbusy are declared but not referenced anywhere
   //    in this module; the state machine below uses fixed-length waits.
   output [31:0]      mem_addr,  // address bus, only ADDR_WIDTH bits are used
   output wire [31:0] mem_wdata, // data to be written
   output wire [3:0]  mem_wmask, // write mask for individual bytes (1 means write byte)
   input [31:0]       mem_rdata, // input lines for both data and instr
   output wire        mem_rstrb, // active to initiate memory read
   input wire         mem_rbusy, // asserted if memory is busy reading value
   input wire         mem_wbusy, // asserted if memory is busy writing value

   input wire         reset,     // set to 0 to reset the processor
   output wire        error      // 1 if current instruction could not be decoded
);

   // The internal register that stores the current address,
   // directly wired to the address bus.
   reg [ADDR_WIDTH-1:0] addressReg;

   // The program counter (not storing the two LSBs, always aligned)
   reg [ADDR_WIDTH-3:0] PC;

   // addressReg is ADDR_WIDTH bits wide; the upper bits of the 32-bit
   // mem_addr bus are zero-extended by this assignment.
   assign mem_addr = addressReg;

   reg [31:0] instr;     // Latched instruction.
   reg [31:0] nextInstr; // Prefetched instruction.

   // Next program counter in normal operation: advance one word
   // I do not use the ALU, I create an additional adder for that.
   // (note that the two LSBs are not stored, always aligned).
   wire [ADDR_WIDTH-3:0] PCplus4 = PC + 1;

   /**************************************************************************************************/
   // Instruction decoding.

   // Internal signals, all generated by the decoder from the current instruction.
   wire [4:0]  writeBackRegId; // The register to be written back
   wire        writeBackEn;    // Needs to be asserted for writing back
   wire [3:0]  writeBackSel;   // 0001: ALU  0010: PC+4  0100: RAM  1000: CSR
   wire [4:0]  regId1;         // Register output 1
   wire [4:0]  regId2;         // Register output 2
   wire        aluInSel1;      // 0: register  1: pc
   wire        aluInSel2;      // 0: register  1: imm
   wire        aluSel;         // 0: force aluOp,aluQual to zero (ADD)  1: use aluOp,aluQual from instr field
   wire [2:0]  aluOp;          // one of the 8 operations done by the ALU
   wire        aluQual;        // 'qualifier' used by some operations (+/-, logical/arith shifts)
   wire        aluM;           // asserted if instr is RV32M.
   wire [31:0] imm;            // immediate value decoded from the instruction
   wire        needWaitALU;    // asserted if instruction uses at least one additional phase in ALU
   wire        isLoad;         // guess what
   wire        isStore;        // guess what
   wire        isJump;         // guess what
   wire        isBranch;       // guess what
   wire        decoderError;   // true if instr does not correspond to any known instr

   // The instruction decoder, that reads the current instruction
   // and generates all the signals from it. It is in fact just a
   // big combinatorial function.
   NrvDecoder decoder(
      .instr(instr),
      .writeBackRegId(writeBackRegId),
      .writeBackEn(writeBackEn),
      .writeBackSel(writeBackSel),
      .inRegId1(regId1),
      .inRegId2(regId2),
      .aluInSel1(aluInSel1),
      .aluInSel2(aluInSel2),
      .aluSel(aluSel),
      .aluOp(aluOp),
      .aluQual(aluQual),
      .aluM(aluM),
      .needWaitALU(needWaitALU),
      .isLoad(isLoad),
      .isStore(isStore),
      .isJump(isJump),
      .isBranch(isBranch),
      .imm(imm),
      .error(decoderError)
   );

   /**************************************************************************************************/
   // Maybe not necessary, but I'd rather latch this one,
   // if this one glitches, then it will break everything...
   // (sampled from decoderError in FETCH_REGS, cleared on reset)
   reg error_latched;
   assign error = error_latched;

   /**************************************************************************************************/
   // The register file. At each cycle, it can read two
   // registers (available at next cycle) and write one.
   wire        writeBack;
   reg  [31:0] writeBackData;
   wire [31:0] regOut1;
   wire [31:0] regOut2;
   NrvRegisterFile regs(
      .clk(clk),
      .in(writeBackData),
      .inEn(writeBack),
      .inRegId(writeBackRegId),
      .outRegId1(regId1),
      .outRegId2(regId2),
      .out1(regOut1),
      .out2(regOut2)
   );

   /**************************************************************************************************/
   // The ALU, partly combinatorial, partly state (for shifts).
   wire [31:0] aluOut;
   wire        aluBusy;
   wire        alu_wenable;
   // {PC, 2'b00} rebuilds the byte address from the word-aligned PC.
   wire [31:0] aluIn1 = aluInSel1 ? {PC, 2'b00} : regOut1;
   wire [31:0] aluIn2 = aluInSel2 ? imm : regOut2;

   // Select the ALU based on RV32M (use large ALU) or plain RV32I (use small ALU)
   // In both cases op/opqual are forced to zero (ADD) when aluSel is 0.
   generate
      if(RV32M) begin
         NrvLargeALU alu(
            .clk(clk),
            .in1(aluIn1),
            .in2(aluIn2),
            .op(aluOp & {3{aluSel}}),
            .opqual(aluQual & aluSel),
            .opM(aluM),
            .out(aluOut),
            .wr(alu_wenable),
            .busy(aluBusy)
         );
      end else begin
         NrvSmallALU #(
`ifdef NRV_TWOSTAGE_SHIFTER
            .TWOSTAGE_SHIFTER(1)
`else
            .TWOSTAGE_SHIFTER(0)
`endif
         ) alu(
            .clk(clk),
            .in1(aluIn1),
            .in2(aluIn2),
            .op(aluOp & {3{aluSel}}),
            .opqual(aluQual & aluSel),
            .out(aluOut),
            .wr(alu_wenable),
            .busy(aluBusy)
         );
      end
   endgenerate

   /****************************************************************************/
   // Memory only does 32-bit aligned accesses. Internally we have two small
   // circuits (one for LOAD and one for STORE) that shift and adapt data
   // according to data type (byte, halfword, word) and memory alignment (addr[1:0]).
   // In addition, it does sign-expansion (when loading a signed byte to a word for
   // instance).

   // LOAD: a small combinatorial circuit that realigns
   // and sign-expands mem_rdata based
   // on width (aluOp[1:0]), signed/unsigned flag (aluOp[2])
   // and the two LSBs of the address.
   wire [31:0] LOAD_mem_rdata_aligned;
   NrvLoadFromMemory load_from_mem(
      .mem_rdata(mem_rdata),        // Raw data read from mem
      .addr_LSBs(mem_addr[1:0]),    // The two LSBs of the address
      .width(aluOp[1:0]),           // Data width: 00:byte 01:hword 10:word
      .is_unsigned(aluOp[2]),       // signed/unsigned flag
      .data(LOAD_mem_rdata_aligned) // Data ready to be sent to register
   );

   // STORE: a small combinatorial circuit that realigns
   // data to be written based on width and the two LSBs
   // of the address.
   // When a STORE instruction is executed, the data to be stored to
   // mem is available from the second register (regOut2) and the
   // address where to store it is the output of the ALU (aluOut).
   wire mem_wenable;
   NrvStoreToMemory store_to_mem(
      .data(regOut2),          // Data to be sent, out of register
      .addr_LSBs(aluOut[1:0]), // The two LSBs of the address
      .width(aluOp[1:0]),      // Data width: 00:byte 01:hword 10:word
      .mem_wdata(mem_wdata),   // Shifted data to be sent to memory
      .mem_wmask(mem_wmask),   // Write mask for the 4 bytes
      .wr_enable(mem_wenable)  // Write enable ('anded' with write mask)
   );

   /*************************************************************************/
   // Control and status registers

`ifdef NRV_CSR
   wire [31:0] CSR_rdata;
   wire        instr_retired;
   NrvControlStatusRegisterFile CSR(
      .clk(clk),                 // for counting cycles
      .instr_cnt(instr_retired), // for counting retired instructions
      .reset(reset),             // reset all CSRs to default value
      .CSRid(instr[31:20]),      // CSR Id, extracted from instr
      .rdata(CSR_rdata)          // Read CSR value
      // TODO: test for errors (.error)
   );
`endif
   // Note: writing to CSRs not implemented yet

   /*************************************************************************/
   // The value written back to the register file.
   // One-hot selection on writeBackSel. There is no default branch: the
   // full_case attribute lets synthesis treat uncovered selections as
   // don't-care, whereas in simulation writeBackData keeps its previous
   // value when no listed writeBackSel bit is set.
   always @(*) begin
      (* parallel_case, full_case *)
      case(1'b1)
        writeBackSel[0]: writeBackData = aluOut;
        writeBackSel[1]: writeBackData = {PCplus4, 2'b00};
        writeBackSel[2]: writeBackData = LOAD_mem_rdata_aligned;
`ifdef NRV_CSR
        writeBackSel[3]: writeBackData = CSR_rdata;
`endif
      endcase
   end

   /*************************************************************************/
   // The predicate for conditional branches.
   wire predOut;
   NrvPredicate pred(
      .in1(regOut1),
      .in2(regOut2),
      .op(aluOp),
      .out(predOut)
   );

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   // The states, using 1-hot encoding (reduces
   // both LUT count and critical path).
   // INITIAL is the all-zero value, i.e. no state bit set.
   localparam INITIAL              = 8'b00000000;
   localparam WAIT_INSTR           = 8'b00000001;
   localparam FETCH_INSTR          = 8'b00000010;
   localparam USE_PREFETCHED_INSTR = 8'b00000100;
   localparam FETCH_REGS           = 8'b00001000;
   localparam EXECUTE              = 8'b00010000;
   localparam WAIT_ALU_OR_DATA     = 8'b00100000;
   localparam LOAD                 = 8'b01000000;
   localparam ERROR                = 8'b10000000;

   // Bit index of each state in the one-hot state register.
   localparam WAIT_INSTR_bit           = 0;
   localparam FETCH_INSTR_bit          = 1;
   localparam USE_PREFETCHED_INSTR_bit = 2;
   localparam FETCH_REGS_bit           = 3;
   localparam EXECUTE_bit              = 4;
   localparam WAIT_ALU_OR_DATA_bit     = 5;
   localparam LOAD_bit                 = 6;
   localparam ERROR_bit                = 7;

   reg [7:0] state = INITIAL;

   // the internal signals that are determined combinatorially from
   // state and other signals.

   // The internal signal that enables register write-back
   // (asserted unconditionally in WAIT_ALU_OR_DATA, gated by writeBackEn
   //  in EXECUTE).
   assign writeBack = (state[EXECUTE_bit] && writeBackEn) ||
                      state[WAIT_ALU_OR_DATA_bit];

   // The memory-read signal. It is only needed for IO, hence it is only enabled
   // right before the LOAD state. To allow execution from IO-mapped devices, it
   // will be necessary to also enable it before instruction fetch.
   assign mem_rstrb = (state[EXECUTE_bit] && isLoad);

   // NOTE: memory writes are done during the USE_PREFETCHED_INSTR state,
   // Can't be done during EXECUTE (would be better), because mem_addr
   // (needed) is updated at the end of EXECUTE.
   // See also how load_from_mem and store_to_mem are wired.
   assign mem_wenable = (state[USE_PREFETCHED_INSTR_bit] && isStore);

   // alu_wenable starts computation in the ALU (for functions that
   // require several cycles).
   assign alu_wenable = (state[EXECUTE_bit]);

   // instr_retired is asserted during one cycle for each
   // retired instruction. It is used to update the instruction
   // counter 'instret' in the control and status registers
`ifdef NRV_CSR
   assign instr_retired = state[FETCH_REGS_bit];
`endif

   // And now the state machine

`define show_state(state) `verbose($display(" %s",state))

   always @(posedge clk) begin
      if(!reset) begin
         // Synchronous, active-low reset.
         state      <= INITIAL;
         addressReg <= 0;
         PC         <= 0;
         // Clear the latched decode error as well, so that the 'error'
         // output does not stay asserted (or undefined in simulation)
         // while reset is held and until the next FETCH_REGS.
         error_latched <= 1'b0;
      end else
      case(1'b1)

        (state == 0): begin
           `show_state("initial");
           state <= WAIT_INSTR;
        end

        state[WAIT_INSTR_bit]: begin
           `show_state("wait_instr");
           // this state to give enough time to fetch the
           // instruction. Used for jumps and taken branches (and
           // when fetching the first instruction).
           state <= FETCH_INSTR;
        end

        state[FETCH_INSTR_bit]: begin
           `show_state("fetch_instr");
           instr <= mem_rdata;
           // update instr address so that next instr is fetched during
           // decode (and ready if there was no jump or branch)
           addressReg <= {PCplus4, 2'b00};
           state <= FETCH_REGS;
        end

        state[USE_PREFETCHED_INSTR_bit]: begin
           `show_state("use_prefetched_instr");
           // for linear execution flow, the prefetched instr (nextInstr)
           // can be used.
           instr <= nextInstr;
           // update instr address so that next instr is fetched during
           // decode (and ready if there was no jump or branch)
           addressReg <= {PCplus4, 2'b00};
           // In addition, STORE instructions write to memory here.
           // (see NrvStoreToMemory store_to_mem at beginning of file).
           state <= FETCH_REGS;
        end

        state[FETCH_REGS_bit]: begin
           `show_state("fetch_regs");
           // instr was just updated -> input register ids also
           // input registers available at next cycle
           state <= EXECUTE;
           error_latched <= decoderError;
        end

        state[EXECUTE_bit]: begin
           `show_state("execute");
           // input registers are read, aluOut is up to date

           // Looked-ahead instr.
           nextInstr <= mem_rdata;

           // Needed for LOAD,STORE,jump,branch
           // (in other cases it will be ignored)
           addressReg <= aluOut;

           if(error_latched) begin
              state <= ERROR;
           end else if(isLoad) begin
              state <= LOAD;
              PC <= PCplus4;
           end else begin
              (* parallel_case, full_case *)
              case(1'b1)
                isJump: begin
                   // Jump target comes from the ALU (two LSBs dropped).
                   PC <= aluOut[31:2];
                   state <= WAIT_INSTR;
                end
                isBranch: begin
                   if(predOut) begin
                      // Taken branch: the prefetched instr is discarded.
                      PC <= aluOut[31:2];
                      state <= WAIT_INSTR;
                   end else begin
                      PC <= PCplus4;
                      state <= USE_PREFETCHED_INSTR;
                   end
                end
                default: begin // linear execution flow
                   PC <= PCplus4;
                   state <= needWaitALU ? WAIT_ALU_OR_DATA : USE_PREFETCHED_INSTR;
                end
              endcase
           end
        end

        state[LOAD_bit]: begin
           `show_state("load");
           // data address (aluOut) was just updated
           // data ready at next cycle
           // we go to WAIT_ALU_OR_DATA to write back read data
           state <= WAIT_ALU_OR_DATA;
        end

        state[WAIT_ALU_OR_DATA_bit]: begin
           `show_state("wait_alu_or_data");
           // - If ALU is still busy, continue to wait.
           // - register writeback is active
           state <= aluBusy ? WAIT_ALU_OR_DATA : USE_PREFETCHED_INSTR;
        end

        state[ERROR_bit]: begin
           // ERROR is only left through reset.
           `bench($display("ERROR"));
           state <= ERROR;
        end

        default: begin
           `bench($display("UNKNOWN STATE"));
           state <= ERROR;
        end

      endcase
   end

   /*********************************************************************/
   // Debug trace (BENCH only): displays the PC and the opcode class of
   // the instruction that is latched in instr during FETCH_REGS.

`define show_opcode(opcode) `verbose($display("%x: %s",{PC,2'b00},opcode))

`ifdef BENCH
   always @(posedge clk) begin
      if(state[FETCH_REGS_bit]) begin
         case(instr[6:0])
           7'b0110111: `show_opcode("LUI");
           7'b0010111: `show_opcode("AUIPC");
           7'b1101111: `show_opcode("JAL");
           7'b1100111: `show_opcode("JALR");
           7'b1100011: `show_opcode("BRANCH");
           7'b0010011: `show_opcode("ALU reg imm");
           7'b0110011: `show_opcode("ALU reg reg");
           7'b0000011: `show_opcode("LOAD");
           7'b0100011: `show_opcode("STORE");
           7'b0001111: `show_opcode("FENCE");
           7'b1110011: `show_opcode("SYSTEM");
         endcase // case (instr[6:0])
      end // if (state[FETCH_REGS_bit])
   end
`endif

endmodule