newStep.v

This commit is contained in:
2025-11-27 04:28:54 +03:00
parent a84b8fcfde
commit 6e38a6c1af
85 changed files with 25646 additions and 6801 deletions

188
RTL/Attic/mini_decoder.v Normal file
View File

@@ -0,0 +1,188 @@
/********************* Instruction decoder *******************************/
// A drop-in replacement of the instruction decoder, meant to further
// reduce LUT count by not checking for errors.
// Optimized by @mecrisp
// in femtorv32.v, replace `include "decoder.v"
// with `include "mini_decoder.v"
// (does not seem to save many LUTs with my version of YOSYS, but it depends).
// NOTE: the structure of the decoder has changed, *** NEEDS TO BE ADAPTED ***
// NrvDecoder ("mini" version): purely combinatorial RV32I instruction decoder.
// It never reports an error and never flags an RV32M operation: both outputs
// are tied to zero below.
module NrvDecoder(
   input wire [31:0] instr,           // instruction word to decode
   output wire [4:0] writeBackRegId,  // destination register, instr[11:7]
   output reg writeBackEn,            // 1: the instruction writes a register
   output reg [3:0] writeBackSel,     // 1-hot: 0001 ALU, 0010 PC+4, 0100 RAM, 1000 counters
                                      // (the original author notes that 1-hot
                                      // reduces LUT count and critical path)
   output wire [4:0] inRegId1,        // source register 1 (forced to 0 for LUI)
   output wire [4:0] inRegId2,        // source register 2, instr[24:20]
   output reg aluSel,                 // 0: ALU forced to ADD  1: aluOp/aluQual taken from instr
   output reg aluInSel1,              // ALU input 1: 0 = register, 1 = PC
   output reg aluInSel2,              // ALU input 2: 0 = register, 1 = imm
   output [2:0] aluOp,                // funct3 field, instr[14:12]
   output reg aluQual,                // qualifier: +/- and logical/arithmetic shift
   output wire aluM,                  // RV32M flag, constant 0 in this decoder
   output reg isLoad,
   output reg isStore,
   output reg isJump,
   output reg isBranch,
   output reg needWaitALU,            // 1 for shifts
   output reg [31:0] imm,             // immediate selected among the I,S,B,J,U formats
   output wire error                  // constant 0 in this decoder
);

   // Codes driven on writeBackSel by this decoder.
   localparam [3:0] WB_FROM_ALU = 4'b0001;
   localparam [3:0] WB_FROM_PC4 = 4'b0010;
   localparam [3:0] WB_FROM_RAM = 4'b0100;
   localparam [3:0] WB_ANY      = 4'bxxxx; // don't care (no write back)

   assign error = 1'b0; // no error checking in the mini decoder
   assign aluM  = 1'b0; // RV32I only

   // Fixed instruction fields.
   reg useRs1; // 0: force inRegId1 to zero, 1: take it from the instruction
   assign writeBackRegId = instr[11:7];
   assign inRegId1       = instr[19:15] & {5{useRs1}};
   assign inRegId2       = instr[24:20];
   assign aluOp          = instr[14:12];

   // The five immediate formats.
   wire msb = instr[31];
   wire [31:0] immI = {{21{msb}}, instr[30:20]};
   wire [31:0] immS = {{21{msb}}, instr[30:25], instr[11:7]};
   wire [31:0] immB = {{20{msb}}, instr[7], instr[30:25], instr[11:8], 1'b0};
   wire [31:0] immJ = {{12{msb}}, instr[19:12], instr[20], instr[30:21], 1'b0};
   wire [31:0] immU = {msb, instr[30:12], {12{1'b0}}};

   // Shifts need to be distinguished for two reasons:
   //  - the ALU must be waited for when the operation is a shift;
   //  - for ALU operations with an immediate, aluQual is 0 except for
   //    shifts, where it is instr[30].
   wire aluOpIsShift = (aluOp == 3'b001) || (aluOp == 3'b101);

   always @(*) begin
      // Baseline settings: write the ALU result back, ALU computes
      // reg + I-immediate. Each case below only overrides what differs.
      writeBackEn  = 1'b1;
      writeBackSel = WB_FROM_ALU;
      useRs1       = 1'b1;
      aluInSel1    = 1'b0;
      aluInSel2    = 1'b1;
      aluSel       = 1'b0;
      aluQual      = 1'b0;
      needWaitALU  = 1'b0;
      imm          = immI;
      isLoad       = 1'b0;
      isStore      = 1'b0;
      isJump       = 1'b0;
      isBranch     = 1'b0;

      (* parallel_case, full_case *)
      casez(instr[6:2])
        5'b011?1: begin // LUI: rd <- 0 + Uimm
           useRs1 = 1'b0;          // register 1 forced to x0
           imm    = immU;
        end

        5'b001?1: begin // AUIPC: rd <- PC + Uimm
           useRs1    = 1'bx;       // don't care, ALU input 1 is the PC
           aluInSel1 = 1'b1;
           imm       = immU;
        end

        5'b11011: begin // JAL: rd <- PC+4, PC <- PC + Jimm
           writeBackSel = WB_FROM_PC4;
           useRs1       = 1'bx;    // don't care, ALU input 1 is the PC
           aluInSel1    = 1'b1;
           isJump       = 1'b1;
           imm          = immJ;
        end

        5'b11001: begin // JALR: rd <- PC+4, PC <- reg + Iimm
           writeBackSel = WB_FROM_PC4;
           isJump       = 1'b1;
        end

        5'b110?0: begin // Branch: PC <- pred ? PC + Bimm : PC+4
           writeBackEn  = 1'b0;
           writeBackSel = WB_ANY;
           aluInSel1    = 1'b1;
           isBranch     = 1'b1;
           imm          = immB;
        end

        5'b001?0: begin // ALU operation: register, immediate
           aluSel      = 1'b1;     // operation taken from the instruction
           aluQual     = aluOpIsShift ? instr[30] : 1'b0; // SRLI/SRAI
           needWaitALU = aluOpIsShift;
        end

        5'b011?0: begin // ALU operation: register, register
           aluInSel2   = 1'b0;
           aluSel      = 1'b1;     // operation taken from the instruction
           aluQual     = instr[30]; // +/- and SRL/SRA
           needWaitALU = aluOpIsShift;
           imm         = 32'bx;    // don't care
        end

        5'b000?0: begin // Load: rd <- mem[reg + Iimm]
           writeBackSel = WB_FROM_RAM;
           isLoad       = 1'b1;
        end

        5'b010?0: begin // Store: mem[reg + Simm] <- reg2
           writeBackEn  = 1'b0;
           writeBackSel = WB_ANY;
           imm          = immS;
           isStore      = 1'b1;
        end

        default: begin // anything else: no write back, the rest is don't care
           writeBackEn  = 1'b0;
           writeBackSel = WB_ANY;
           useRs1       = 1'bx;
           aluInSel1    = 1'bx;
           aluInSel2    = 1'bx;
           aluSel       = 1'bx;
           imm          = 32'bx;
        end
      endcase
   end
endmodule

456
RTL/Attic/mini_femtorv32.v Normal file
View File

@@ -0,0 +1,456 @@
// femtorv32, a minimalistic RISC-V RV32I core
// (minus SYSTEM and FENCE that are not implemented)
// Bruno Levy, May-June 2020
//
// drop-in replacement of femtorv32,
// does 3 CPIs (cycles per instructions) in linear execution flow
// (to be compared with 2 CPIs with femtorv32.v),
// saves 20-50 LUTs
// in femtosoc.v, replace `include "femtorv32.v"
// with `include "mini_femtorv32.v"
//
// NOTE: the structure of the decoder has changed, *** NEEDS TO BE ADAPTED ***
/*******************************************************************/
`include "utils.v" // Utilities, macros for debugging
`include "register_file.v" // The 31 general-purpose registers
`include "small_alu.v" // Used on IceStick, RV32I
`include "large_alu.v" // For larger FPGAs, RV32IM
`include "branch_predicates.v" // Tests for branch instructions
`include "decoder.v" // The instruction decoder
`include "aligned_memory_access.v" // Read/write bytes, hwords and words from memory
`include "CSR_file.v" // (Optional) Control and Status registers
/********************* Nrv processor *******************************/
// FemtoRV32 ("mini" variant): a multi-cycle RV32I core built from an
// instruction decoder, a register file, an ALU, a branch predicate unit,
// load/store alignment circuits and a 1-hot state machine.
//
// Parameters:
//   RV32M      - 1 selects NrvLargeALU (mul/div/rem), 0 selects NrvSmallALU
//   ADDR_WIDTH - number of address bits stored in addressReg / PC
//
// Changes with respect to the previous revision of this module:
//   - error_latched is now cleared on reset, so that the 'error' output is
//     defined (and not stale) right after reset;
//   - the write-back multiplexer has an explicit default, so that simulation
//     no longer infers a latch where synthesis (full_case) sees a don't-care.
module FemtoRV32 #(
   parameter [0:0] RV32M = 0, // Set to 1 to support mul/div/rem instructions
   parameter ADDR_WIDTH = 16  // width of the address bus
) (
   input clk,
   // Memory interface: using the same protocol as Claire Wolf's picoR32
   // (WIP: add mem_valid / mem_ready protocol)
   output [31:0] mem_addr,       // address bus, only ADDR_WIDTH bits are used
   output wire [31:0] mem_wdata, // data to be written
   output wire [3:0] mem_wmask,  // write mask for individual bytes (1 means write byte)
   input [31:0] mem_rdata,       // input lines for both data and instr
   output wire mem_rstrb,        // active to initiate memory read
   input wire mem_rbusy,         // asserted if memory is busy reading value
   input wire mem_wbusy,         // asserted if memory is busy writing value
   input wire reset,             // set to 0 to reset the processor
   output wire error             // 1 if current instruction could not be decoded
);

   // The internal register that stores the current address,
   // directly wired to the address bus.
   reg [ADDR_WIDTH-1:0] addressReg;

   // The program counter (not storing the two LSBs, always aligned)
   reg [ADDR_WIDTH-3:0] PC;

   assign mem_addr = addressReg;

   reg [31:0] instr;     // Latched instruction.
   reg [31:0] nextInstr; // Prefetched instruction.

   // Next program counter in normal operation: advance one word
   // I do not use the ALU, I create an additional adder for that.
   // (note that the two LSBs are not stored, always aligned).
   wire [ADDR_WIDTH-3:0] PCplus4 = PC + 1;

   /**************************************************************************************************/
   // Instruction decoding.

   // Internal signals, all generated by the decoder from the current instruction.
   wire [4:0] writeBackRegId; // The register to be written back
   wire writeBackEn;          // Needs to be asserted for writing back
   wire [3:0] writeBackSel;   // 0001: ALU 0010: PC+4 0100: RAM 1000: CSR
   wire [4:0] regId1;         // Register output 1
   wire [4:0] regId2;         // Register output 2
   wire aluInSel1;            // 0: register 1: pc
   wire aluInSel2;            // 0: register 1: imm
   wire aluSel;               // 0: force aluOp,aluQual to zero (ADD) 1: use aluOp,aluQual from instr field
   wire [2:0] aluOp;          // one of the 8 operations done by the ALU
   wire aluQual;              // 'qualifier' used by some operations (+/-, logical/arith shifts)
   wire aluM;                 // asserted if instr is RV32M.
   wire [31:0] imm;           // immediate value decoded from the instruction
   wire needWaitALU;          // asserted if instruction uses at least one additional phase in ALU
   wire isLoad;               // instruction is a load
   wire isStore;              // instruction is a store
   wire isJump;               // instruction is an unconditional jump
   wire isBranch;             // instruction is a conditional branch
   wire decoderError;         // true if instr does not correspond to any known instr

   // The instruction decoder, that reads the current instruction
   // and generates all the signals from it. It is in fact just a
   // big combinatorial function.
   NrvDecoder decoder(
      .instr(instr),
      .writeBackRegId(writeBackRegId),
      .writeBackEn(writeBackEn),
      .writeBackSel(writeBackSel),
      .inRegId1(regId1),
      .inRegId2(regId2),
      .aluInSel1(aluInSel1),
      .aluInSel2(aluInSel2),
      .aluSel(aluSel),
      .aluOp(aluOp),
      .aluQual(aluQual),
      .aluM(aluM),
      .needWaitALU(needWaitALU),
      .isLoad(isLoad),
      .isStore(isStore),
      .isJump(isJump),
      .isBranch(isBranch),
      .imm(imm),
      .error(decoderError)
   );

   /**************************************************************************************************/
   // Maybe not necessary, but I'd rather latch this one,
   // if this one glitches, then it will break everything...
   // It is sampled in FETCH_REGS, tested in EXECUTE and cleared on reset.
   reg error_latched;
   assign error = error_latched;

   /**************************************************************************************************/
   // The register file. At each cycle, it can read two
   // registers (available at next cycle) and write one.
   wire writeBack;
   reg [31:0] writeBackData;
   wire [31:0] regOut1;
   wire [31:0] regOut2;
   NrvRegisterFile regs(
      .clk(clk),
      .in(writeBackData),
      .inEn(writeBack),
      .inRegId(writeBackRegId),
      .outRegId1(regId1),
      .outRegId2(regId2),
      .out1(regOut1),
      .out2(regOut2)
   );

   /**************************************************************************************************/
   // The ALU, partly combinatorial, partly state (for shifts).
   wire [31:0] aluOut;
   wire aluBusy;
   wire alu_wenable;
   wire [31:0] aluIn1 = aluInSel1 ? {PC, 2'b00} : regOut1;
   wire [31:0] aluIn2 = aluInSel2 ? imm : regOut2;

   // Select the ALU based on RV32M (use large ALU) or plain RV32I (use small ALU)
   // In both cases op/opqual are masked by aluSel, which forces ADD when aluSel is 0.
   generate
      if(RV32M) begin
         NrvLargeALU alu(
            .clk(clk),
            .in1(aluIn1),
            .in2(aluIn2),
            .op(aluOp & {3{aluSel}}),
            .opqual(aluQual & aluSel),
            .opM(aluM),
            .out(aluOut),
            .wr(alu_wenable),
            .busy(aluBusy)
         );
      end else begin
         NrvSmallALU #(
`ifdef NRV_TWOSTAGE_SHIFTER
            .TWOSTAGE_SHIFTER(1)
`else
            .TWOSTAGE_SHIFTER(0)
`endif
         ) alu(
            .clk(clk),
            .in1(aluIn1),
            .in2(aluIn2),
            .op(aluOp & {3{aluSel}}),
            .opqual(aluQual & aluSel),
            .out(aluOut),
            .wr(alu_wenable),
            .busy(aluBusy)
         );
      end
   endgenerate

   /****************************************************************************/
   // Memory only does 32-bit aligned accesses. Internally we have two small
   // circuits (one for LOAD and one for STORE) that shift and adapt data
   // according to data type (byte, halfword, word) and memory alignment (addr[1:0]).
   // In addition, it does sign-expansion (when loading a signed byte to a word for
   // instance).

   // LOAD: a small combinatorial circuit that realigns
   // and sign-expands mem_rdata based
   // on width (aluOp[1:0]), signed/unsigned flag (aluOp[2])
   // and the two LSBs of the address.
   wire [31:0] LOAD_mem_rdata_aligned;
   NrvLoadFromMemory load_from_mem(
      .mem_rdata(mem_rdata),        // Raw data read from mem
      .addr_LSBs(mem_addr[1:0]),    // The two LSBs of the address
      .width(aluOp[1:0]),           // Data width: 00:byte 01:hword 10:word
      .is_unsigned(aluOp[2]),       // signed/unsigned flag
      .data(LOAD_mem_rdata_aligned) // Data ready to be sent to register
   );

   // STORE: a small combinatorial circuit that realigns
   // data to be written based on width and the two LSBs
   // of the address.
   // When a STORE instruction is executed, the data to be stored to
   // mem is available from the second register (regOut2) and the
   // address where to store it is the output of the ALU (aluOut).
   wire mem_wenable;
   NrvStoreToMemory store_to_mem(
      .data(regOut2),          // Data to be sent, out of register
      .addr_LSBs(aluOut[1:0]), // The two LSBs of the address
      .width(aluOp[1:0]),      // Data width: 00:byte 01:hword 10:word
      .mem_wdata(mem_wdata),   // Shifted data to be sent to memory
      .mem_wmask(mem_wmask),   // Write mask for the 4 bytes
      .wr_enable(mem_wenable)  // Write enable ('anded' with write mask)
   );

   /*************************************************************************/
   // Control and status registers
`ifdef NRV_CSR
   wire [31:0] CSR_rdata;
   wire instr_retired;
   NrvControlStatusRegisterFile CSR(
      .clk(clk),                 // for counting cycles
      .instr_cnt(instr_retired), // for counting retired instructions
      .reset(reset),             // reset all CSRs to default value
      .CSRid(instr[31:20]),      // CSR Id, extracted from instr
      .rdata(CSR_rdata)          // Read CSR value
      // TODO: test for errors (.error)
   );
`endif
   // Note: writing to CSRs not implemented yet

   /*************************************************************************/
   // The value written back to the register file, selected by the 1-hot
   // writeBackSel. The default branch makes the "no source selected" case an
   // explicit don't-care, so that simulation matches what full_case tells the
   // synthesis tool (without it, simulation would hold the previous value).
   always @(*) begin
      (* parallel_case, full_case *)
      case(1'b1)
        writeBackSel[0]: writeBackData = aluOut;
        writeBackSel[1]: writeBackData = {PCplus4, 2'b00};
        writeBackSel[2]: writeBackData = LOAD_mem_rdata_aligned;
`ifdef NRV_CSR
        writeBackSel[3]: writeBackData = CSR_rdata;
`endif
        default:         writeBackData = 32'bx;
      endcase
   end

   /*************************************************************************/
   // The predicate for conditional branches.
   wire predOut;
   NrvPredicate pred(
      .in1(regOut1),
      .in2(regOut2),
      .op(aluOp),
      .out(predOut)
   );

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   // The states, using 1-hot encoding (reduces
   // both LUT count and critical path).
   // INITIAL is the all-zeroes value, tested with (state == 0).
   localparam INITIAL              = 8'b00000000;
   localparam WAIT_INSTR           = 8'b00000001;
   localparam FETCH_INSTR          = 8'b00000010;
   localparam USE_PREFETCHED_INSTR = 8'b00000100;
   localparam FETCH_REGS           = 8'b00001000;
   localparam EXECUTE              = 8'b00010000;
   localparam WAIT_ALU_OR_DATA     = 8'b00100000;
   localparam LOAD                 = 8'b01000000;
   localparam ERROR                = 8'b10000000;

   // Index of the bit set in each of the states above.
   localparam WAIT_INSTR_bit           = 0;
   localparam FETCH_INSTR_bit          = 1;
   localparam USE_PREFETCHED_INSTR_bit = 2;
   localparam FETCH_REGS_bit           = 3;
   localparam EXECUTE_bit              = 4;
   localparam WAIT_ALU_OR_DATA_bit     = 5;
   localparam LOAD_bit                 = 6;
   localparam ERROR_bit                = 7;

   reg [7:0] state = INITIAL;

   // the internal signals that are determined combinatorially from
   // state and other signals.

   // The internal signal that enables register write-back
   // (unconditional in WAIT_ALU_OR_DATA, which is only reached from
   // the LOAD state or when needWaitALU is asserted).
   assign writeBack = (state[EXECUTE_bit] && writeBackEn) || state[WAIT_ALU_OR_DATA_bit];

   // The memory-read signal. It is only needed for IO, hence it is only enabled
   // right before the LOAD state. To allow execution from IO-mapped devices, it
   // will be necessary to also enable it before instruction fetch.
   assign mem_rstrb = (state[EXECUTE_bit] && isLoad);

   // NOTE: memory write are done during the USE_PREFETCHED_INSTR state,
   // Can't be done during EXECUTE (would be better), because mem_addr
   // (needed) is updated at the end of EXECUTE.
   // See also how load_from_mem and store_to_mem are wired.
   assign mem_wenable = (state[USE_PREFETCHED_INSTR_bit] && isStore);

   // alu_wenable starts computation in the ALU (for functions that
   // require several cycles).
   assign alu_wenable = (state[EXECUTE_bit]);

   // instr_retired is asserted during one cycle for each
   // retired instructions. It is used to update the instruction
   // counter 'instret' in the control and status registers
`ifdef NRV_CSR
   assign instr_retired = state[FETCH_REGS_bit];
`endif

   // And now the state machine
   // (`verbose and `bench are not defined in this file, presumably they
   //  come from utils.v)
`define show_state(state) `verbose($display(" %s",state))
   always @(posedge clk) begin
      if(!reset) begin
         state <= INITIAL;
         addressReg <= 0;
         PC <= 0;
         // Clear the error flag as well, else 'error' keeps its previous
         // (or undefined) value until the next FETCH_REGS state.
         error_latched <= 1'b0;
      end else
      case(1'b1)
        (state == 0): begin
           `show_state("initial");
           state <= WAIT_INSTR;
        end
        state[WAIT_INSTR_bit]: begin
           `show_state("wait_instr");
           // this state to give enough time to fetch the
           // instruction. Used for jumps and taken branches (and
           // when fetching the first instruction).
           state <= FETCH_INSTR;
        end
        state[FETCH_INSTR_bit]: begin
           `show_state("fetch_instr");
           instr <= mem_rdata;
           // update instr address so that next instr is fetched during
           // decode (and ready if there was no jump or branch)
           addressReg <= {PCplus4, 2'b00};
           state <= FETCH_REGS;
        end
        state[USE_PREFETCHED_INSTR_bit]: begin
           `show_state("use_prefetched_instr");
           // for linear execution flow, the prefetched instr (nextInstr)
           // can be used.
           instr <= nextInstr;
           // update instr address so that next instr is fetched during
           // decode (and ready if there was no jump or branch)
           addressReg <= {PCplus4, 2'b00};
           // In addition, STORE instructions write to memory here.
           // (see NrvStoreToMemory store_to_mem at beginning of file).
           state <= FETCH_REGS;
        end
        state[FETCH_REGS_bit]: begin
           `show_state("fetch_regs");
           // instr was just updated -> input register ids also
           // input registers available at next cycle
           state <= EXECUTE;
           error_latched <= decoderError;
        end
        state[EXECUTE_bit]: begin
           `show_state("execute");
           // input registers are read, aluOut is up to date
           // Looked-ahead instr.
           nextInstr <= mem_rdata;
           // Needed for LOAD,STORE,jump,branch
           // (in other cases it will be ignored)
           addressReg <= aluOut;
           if(error_latched) begin
              state <= ERROR;
           end else if(isLoad) begin
              state <= LOAD;
              PC <= PCplus4;
           end else begin
              (* parallel_case, full_case *)
              case(1'b1)
                isJump: begin
                   PC <= aluOut[31:2];
                   state <= WAIT_INSTR;
                end
                isBranch: begin
                   if(predOut) begin
                      PC <= aluOut[31:2];
                      state <= WAIT_INSTR;
                   end else begin
                      PC <= PCplus4;
                      state <= USE_PREFETCHED_INSTR;
                   end
                end
                default: begin // linear execution flow
                   PC <= PCplus4;
                   state <= needWaitALU ? WAIT_ALU_OR_DATA : USE_PREFETCHED_INSTR;
                end
              endcase
           end
        end
        state[LOAD_bit]: begin
           `show_state("load");
           // data address (aluOut) was just updated
           // data ready at next cycle
           // we go to WAIT_ALU_OR_DATA to write back read data
           state <= WAIT_ALU_OR_DATA;
        end
        state[WAIT_ALU_OR_DATA_bit]: begin
           `show_state("wait_alu_or_data");
           // - If ALU is still busy, continue to wait.
           // - register writeback is active
           state <= aluBusy ? WAIT_ALU_OR_DATA : USE_PREFETCHED_INSTR;
        end
        state[ERROR_bit]: begin
           // ERROR is only left through reset.
           `bench($display("ERROR"));
           state <= ERROR;
        end
        default: begin
           `bench($display("UNKNOWN STATE"));
           state <= ERROR;
        end
      endcase
   end

   /*********************************************************************/
   // Simulation-only trace: displays the opcode class of each instruction
   // when the state machine is in FETCH_REGS.
`define show_opcode(opcode) `verbose($display("%x: %s",{PC,2'b00},opcode))
`ifdef BENCH
   always @(posedge clk) begin
      if(state[FETCH_REGS_bit]) begin
         case(instr[6:0])
           7'b0110111: `show_opcode("LUI");
           7'b0010111: `show_opcode("AUIPC");
           7'b1101111: `show_opcode("JAL");
           7'b1100111: `show_opcode("JALR");
           7'b1100011: `show_opcode("BRANCH");
           7'b0010011: `show_opcode("ALU reg imm");
           7'b0110011: `show_opcode("ALU reg reg");
           7'b0000011: `show_opcode("LOAD");
           7'b0100011: `show_opcode("STORE");
           7'b0001111: `show_opcode("FENCE");
           7'b1110011: `show_opcode("SYSTEM");
         endcase // case (instr[6:0])
      end // if (state[FETCH_REGS_bit])
   end
`endif
endmodule

42
RTL/CONFIGS/arty_config.v Normal file
View File

@@ -0,0 +1,42 @@
// Default femtosoc configuration file for ARTY
// This file only contains preprocessor definitions (devices, core selection,
// frequency, reset address, RAM size); it ends by defining NRV_CONFIGURED.
/*** Devices ******************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 = errors)
`define NRV_IO_UART // Mapped IO, virtual UART (USB)
`define NRV_IO_SSD1351 // Mapped IO, 128x128x64K OLED screen
//`define NRV_IO_MAX7219 // Mapped IO, 8x8 led matrix
//`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space.
/*** Processor configuration **************************************************/
`define NRV_FREQ 70 // Frequency in MHz, needs to be a multiple of 5
// Core selection: a single NRV_FEMTORV32_* line is left uncommented below
// (PETITBATEAU), all the other cores are commented out.
// CORE RV32 subset fmax validated-experimental
//
//`define NRV_FEMTORV32_QUARK // RV32I fmax = 80-110 MHz
//`define NRV_FEMTORV32_TACHYON // RV32I fmax = 100-135 MHz
//`define NRV_FEMTORV32_ELECTRON // RV32IM fmax = 70-80 MHz
//`define NRV_FEMTORV32_INTERMISSUM // RV32IM, IRQ fmax = 60-80 MHz
//`define NRV_FEMTORV32_GRACILIS // RV32IMC, IRQ fmax = 60-80 MHz
`define NRV_FEMTORV32_PETITBATEAU // RV32IMFC, IRQ fmax = 50-80 MHz
//`define NRV_FEMTORV32_TESTDRIVE
`define NRV_RESET_ADDR 0 // The address the processor jumps to on reset
/*** RAM (in bytes, needs to be a multiple of 4)*******************************/
`define NRV_RAM 65536
//`define NRV_RAM 262144 // On the ARTY, does not work with more than 64k,
// I don't know why.
/*** Advanced devices configuration *******************************************/
`define NRV_IO_HARDWARE_CONFIG // Hardware config registers mapped in IO-Space
// (note: firmware libfemtorv32 depends on it)
/******************************************************************************/
`define NRV_NEGATIVE_RESET // reset button active low
`define NRV_CONFIGURED

View File

@@ -0,0 +1,19 @@
// femtosoc configuration file: LEDs, UART and SSD1351 devices enabled,
// PETITBATEAU core selected (the other cores are commented out).
// NOTE(review): the target board is not stated in this file - confirm.
`define NRV_IO_LEDS
`define NRV_IO_UART
`define NRV_IO_SSD1351
`define NRV_FREQ 1
//`define NRV_FEMTORV32_QUARK // RV32I (the most elementary femtorv)
//`define NRV_FEMTORV32_ELECTRON // RV32IM
//`define NRV_FEMTORV32_INTERMISSUM // RV32IMzCSR
//`define NRV_FEMTORV32_GRACILIS // RV32IMCzCSR
`define NRV_FEMTORV32_PETITBATEAU // WIP RV32F !!
//`define NRV_FEMTORV32_TESTDRIVE
`define NRV_RESET_ADDR 0
`define NRV_RAM 65536
`define NRV_IO_HARDWARE_CONFIG
`define NRV_CONFIGURED

View File

@@ -0,0 +1,42 @@
// Default femtosoc configuration file for ARTY
// This file only contains preprocessor definitions (devices, core selection,
// frequency, reset address, RAM size); it ends by defining NRV_CONFIGURED.
/*** Devices ******************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 = errors)
`define NRV_IO_UART // Mapped IO, virtual UART (USB)
`define NRV_IO_SSD1351 // Mapped IO, 128x128x64K OLED screen
//`define NRV_IO_MAX7219 // Mapped IO, 8x8 led matrix
//`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space.
/*** Processor configuration **************************************************/
`define NRV_FREQ 70 // Frequency in MHz, needs to be a multiple of 5
// Core selection: a single NRV_FEMTORV32_* line is left uncommented below
// (PETITBATEAU), all the other cores are commented out.
// CORE RV32 subset fmax validated-experimental
//
//`define NRV_FEMTORV32_QUARK // RV32I fmax = 80-110 MHz
//`define NRV_FEMTORV32_TACHYON // RV32I fmax = 100-135 MHz
//`define NRV_FEMTORV32_ELECTRON // RV32IM fmax = 70-80 MHz
//`define NRV_FEMTORV32_INTERMISSUM // RV32IM, IRQ fmax = 60-80 MHz
//`define NRV_FEMTORV32_GRACILIS // RV32IMC, IRQ fmax = 60-80 MHz
`define NRV_FEMTORV32_PETITBATEAU // RV32IMFC, IRQ fmax = 50-80 MHz
//`define NRV_FEMTORV32_TESTDRIVE
`define NRV_RESET_ADDR 0 // The address the processor jumps to on reset
/*** RAM (in bytes, needs to be a multiple of 4)*******************************/
`define NRV_RAM 65536
//`define NRV_RAM 262144 // On the ARTY, does not work with more than 64k,
// I don't know why.
/*** Advanced devices configuration *******************************************/
`define NRV_IO_HARDWARE_CONFIG // Hardware config registers mapped in IO-Space
// (note: firmware libfemtorv32 depends on it)
/******************************************************************************/
`define NRV_NEGATIVE_RESET // reset button active low
`define NRV_CONFIGURED

View File

@@ -0,0 +1,36 @@
// Default femtosoc configuration file for IceStick
// NOTE(review): the header above says IceStick, but this file enables
// NRV_COUNTERS and NRV_RV32M, which the comments below describe as not
// fitting / not supported on the IceStick - presumably this configuration
// targets a larger board; confirm and fix the header.
/************************* Devices **********************************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 is used to display errors)
//`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space. Use with MINIRV32 to run code from SPI flash.
/************************* Frequency ********************************************************************************/
`define NRV_FREQ 65 // Frequency in MHz.
/************************* RAM (in bytes, needs to be a multiple of 4)***********************************************/
`define NRV_RAM 262144 // RAM in bytes
/************************* Processor configuration ******************************************************************/
`define NRV_CSR // Uncomment if using something below (counters,...)
`define NRV_COUNTERS // Uncomment for instr and cycle counters (won't fit on the ICEStick)
`define NRV_COUNTERS_64 // ... and uncomment this one as well if you want 64-bit counters
`define NRV_RV32M // Uncomment for hardware mul and div support (RV32M instructions). Not supported on IceStick !
`define NRV_LATCH_ALU // Uncomment to latch all ALU ops (reduces critical path)
/************************* Advanced processor configuration *********************************************************/
`define NRV_RESET_ADDR 24'h000000
//`define NRV_RESET_ADDR 24'h810000 // Jump execution to SPI Flash (Mapped at 800000h, + leave 64k (10000h) for FPGA bitstream)
`define NRV_IO_HARDWARE_CONFIG // Comment-out to disable hardware config registers mapped in IO-Space
// (only if you use your own firmware, libfemtorv32 depends on it)
/******************************************************************************************************************/
//`define NRV_RUN_FROM_SPI_FLASH // Do not 'readmemh()' firmware from '.hex' file
`define NRV_CONFIGURED

View File

@@ -0,0 +1,38 @@
// Default femtosoc configuration file for IceStick
// Minimal setup: only the LEDs device is enabled, and the only processor
// option left uncommented is NRV_TWOSTAGE_SHIFTER.
// NOTE(review): NRV_CONFIGURED is not defined anywhere in the visible part
// of this file - confirm whether that is intended.
/************************* Devices **********************************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 is used to display errors)
//`define NRV_IO_UART // Mapped IO, virtual UART (USB)
//`define NRV_IO_SSD1351 // Mapped IO, 128x128x64K OLed screen
//`define NRV_IO_MAX7219 // Mapped IO, 8x8 led matrix
//`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space. Can be used with MINIRV32 to run code from SPI flash.
/************************* Frequency ********************************************************************************/
`define NRV_FREQ 50 // Frequency in MHz. Recomm: 50 MHz (FOMU: 16MHz) Overclocking: 80-100 MHz (HX1K, ECP5)
/************************* RAM (in bytes, needs to be a multiple of 4)***********************************************/
`define NRV_RAM 6144 // default for ICESTICK (cannot do more !)
//`define NRV_RAM 1024 // small ICESTICK config (to further save LUTs if need be)
/************************* Processor configuration ******************************************************************/
//`define NRV_CSR // Uncomment if using something below (counters,...)
//`define NRV_COUNTERS // Uncomment for instr and cycle counters (won't fit on the ICEStick)
//`define NRV_COUNTERS_64 // ... and uncomment this one as well if you want 64-bit counters
`define NRV_TWOSTAGE_SHIFTER // if not RV32M, comment-out if running out of LUTs (at the expense of slower shifts)
//`define NRV_LATCH_ALU // Uncomment to latch all ALU ops (reduces critical path)
/************************* Advanced processor configuration *********************************************************/
`define NRV_RESET_ADDR 0 // The address the processor jumps to on reset
//`define NRV_RESET_ADDR 32'h00800000 // If using NRV_MINIRV32 and mapped SPI Flash, you may want to jump to
// a bootloader or firmware stored there.
`define NRV_IO_HARDWARE_CONFIG // Comment-out to disable hardware config registers mapped in IO-Space
// (only if you use your own firmware, libfemtorv32 depends on it)
/******************************************************************************************************************/

View File

@@ -0,0 +1,46 @@
// Default femtosoc configuration file for IceStick
// NOTE(review): the header above says IceStick, but the comments below refer
// to the IceBreaker and to the single-ported RAM of the ice40-up5k -
// presumably this is the IceBreaker configuration; confirm and fix the header.
`define NRV_NEGATIVE_RESET
/************************* Devices **********************************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 is used to display errors)
`define NRV_IO_UART // Mapped IO, virtual UART (USB)
`define NRV_IO_SSD1351 // Mapped IO, 128x128x64K OLed screen
`define NRV_IO_MAX7219 // Mapped IO, 8x8 led matrix
`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space. Can be used with MINIRV32 to run code from SPI flash.
/************************* Processor configuration ******************************************************************/
// Core selection: only the ELECTRON line is left uncommented below.
//`define NRV_FEMTORV32_QUARK_BICYCLE // RV32I
`define NRV_FEMTORV32_ELECTRON // RV32IM
//`define NRV_FEMTORV32_INTERMISSUM // RV32IM + IRQ
//`define NRV_FEMTORV32_GRACILIS // RV32IMC + IRQ
//`define NRV_FEMTORV32_PETITBATEAU // RV32IMFC + IRQ, does not fit on IceBreaker
`define NRV_FREQ 20 // Frequency in MHz. Recomm: 15 MHz Overclocking: 20-25 MHz
`define NRV_RESET_ADDR 32'h00820000 // Jump execution to SPI Flash (800000h, +128k(20000h) for FPGA bitstream)
// tinyraytracer: 30 MHz RV32IM electron 3:12
// 20 MHz RV32IM gracilis 3:44
// 20 MHz RV32IMC gracilis 3:32
// 25 MHz RV32IMC gracilis 2:49
/************************* RAM (in bytes, needs to be a multiple of 4)***********************************************/
// Using the 128 kbytes of single-ported RAM of the ice40-up5k
// Note: cannot initialize it from .hex file, need to run from SPI Flash
`define ICE40UP5K_SPRAM
`define NRV_RAM 131072
// (other option, the 12 kbytes of BRAM, this one can be initialized from .hex file).
//`define NRV_RAM 12288
/************************* Advanced devices configuration *********************************************************/
`define NRV_RUN_FROM_SPI_FLASH // Do not 'readmemh()' firmware from '.hex' file
`define NRV_IO_HARDWARE_CONFIG // Comment-out to disable hardware config registers mapped in IO-Space
// (only if you use your own firmware, libfemtorv32 depends on it)
/******************************************************************************************************************/
`define NRV_CONFIGURED

View File

@@ -0,0 +1,45 @@
// Default femtosoc configuration file for IceStick
// Two processor setups are listed below: a "Tachyon" one, disabled by the
// surrounding block comment, and the active "Quark" one that follows it.
/************************* Devices **********************************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 is used to display errors)
`define NRV_IO_IRDA // In IO_LEDS, support for the IRDA on the IceStick (WIP)
`define NRV_IO_UART // Mapped IO, virtual UART (USB)
`define NRV_IO_SSD1351 // Mapped IO, 128x128x64K OLED screen
`define NRV_IO_MAX7219 // Mapped IO, 8x8 led matrix
`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space. Can be used to run code from SPI flash.
/************************* Processor configuration *******************************************************************/
// Alternative setup (Tachyon core), kept for reference inside a block comment:
/*
`define NRV_FEMTORV32_TACHYON // "Tachyon" (carefully latched for max highfreq). Needs more space (remove MAX7219).
`define NRV_FREQ 60 // Validated at 60 MHz on the IceStick. Can overclock to 80-95 MHz.
`define NRV_RESET_ADDR 32'h00820000 // Jump execution to SPI Flash (800000h, +128k(20000h) for FPGA bitstream)
`define NRV_COUNTER_WIDTH 24 // Number of bits in cycles counter
`define NRV_TWOLEVEL_SHIFTER // Faster shifts
*/
// tinyraytracer: 90 MHz, 14:02
// 95 MHz, 13:18
// Active setup (Quark core):
`define NRV_FEMTORV32_QUARK
`define NRV_FREQ 50 // Validated at 50 MHz on the IceStick. Can overclock to 70 MHz.
`define NRV_RESET_ADDR 32'h00820000 // Jump execution to SPI Flash (800000h, +128k(20000h) for FPGA bitstream)
`define NRV_COUNTER_WIDTH 24 // Number of bits in cycles counter
`define NRV_TWOLEVEL_SHIFTER // Faster shifts
// tinyraytracer: 70 MHz, 17:30
/************************* RAM (in bytes, needs to be a multiple of 4)***********************************************/
`define NRV_RAM 6144 // default for ICESTICK (cannot do more !)
/************************* Advanced devices configuration ***********************************************************/
`define NRV_RUN_FROM_SPI_FLASH // Do not 'readmemh()' firmware from '.hex' file
`define NRV_IO_HARDWARE_CONFIG // Comment-out to disable hardware config registers mapped in IO-Space
// (note: firmware libfemtorv32 depends on it)
/********************************************************************************************************************/
`define NRV_CONFIGURED

View File

@@ -0,0 +1,35 @@
// Default femtosoc configuration file for iCESugar-nano (iCE40LP1KCM36)
// This file only sets preprocessor macros; it ends by defining NRV_CONFIGURED.
// Lines starting with //`define are options left disabled for this board.
/************************* Devices **********************************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 is used to display errors)
//`define NRV_IO_IRDA // In IO_LEDS, support for the IRDA (WIP)
`define NRV_IO_UART // Mapped IO, virtual UART (USB)
//`define NRV_IO_SSD1351 // Mapped IO, 128x128x64K OLED screen
//`define NRV_IO_MAX7219 // Mapped IO, 8x8 led matrix
`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space. Can be used to run code from SPI flash.
/************************* Processor configuration *******************************************************************/
//`define NRV_FEMTORV32_TACHYON // "Tachyon" (carefully latched for max highfreq). Needs more space (remove MAX7219).
`define NRV_FEMTORV32_QUARK
`define NRV_FREQ 12 // 12 MHz is the default clock frequency in iCESugar-nano. Board max is 72 MHz.
`define NRV_RESET_ADDR 32'h00820000 // Jump execution to SPI Flash (800000h, +128k(20000h) for FPGA bitstream)
`define NRV_COUNTER_WIDTH 24 // Number of bits in cycles counter
//`define NRV_TWOLEVEL_SHIFTER // Faster shifts
//`define NRV_NEGATIVE_RESET // presumably selects an active-low reset; TODO confirm where this macro is consumed
/************************* RAM (in bytes, needs to be a multiple of 4)***********************************************/
// Exactly one NRV_RAM definition must be active.
//`define NRV_RAM 4096 // 4kB, for less LUTs usage, edit spiflash_icesugar_nano.ld too for 4kB RAM
`define NRV_RAM 6144 // 6kB, default for iCESugar-nano (iCE40LP1KCM36) (cannot do more !)
/************************* Advanced devices configuration ***********************************************************/
`define NRV_RUN_FROM_SPI_FLASH // Do not 'readmemh()' firmware from '.hex' file
`define NRV_IO_HARDWARE_CONFIG // Comment-out to disable hardware config registers mapped in IO-Space
// (note: firmware libfemtorv32 depends on it)
/********************************************************************************************************************/
`define NRV_CONFIGURED

View File

@@ -0,0 +1,39 @@
// Default femtosoc configuration file for ULX3S
// This file only sets preprocessor macros; it ends by defining NRV_CONFIGURED.
// Lines starting with //`define are options left disabled for this board.
/************************* Devices **********************************************************************************/
`define NRV_IO_LEDS // Mapped IO, LEDs D1,D2,D3,D4 (D5 is used to display errors)
`define NRV_IO_UART // Mapped IO, virtual UART (USB)
`define NRV_IO_SSD1331 // Mapped IO, 96x64x64K OLed screen
//`define NRV_IO_SSD1351 // Mapped IO, 128x128x64K OLed screen
//`define NRV_IO_MAX7219 // Mapped IO, 8x8 led matrix
`define NRV_IO_SDCARD // Mapped IO, SPI SDCARD
`define NRV_IO_BUTTONS // Mapped IO, buttons
`define NRV_MAPPED_SPI_FLASH // SPI flash mapped in address space. Use with MINIRV32 to run code from SPI flash.
`define NRV_IO_FGA // Femto Graphic Adapter (ULX3S only)
/************************* Frequency and processor configuration ***************************************************/
`define NRV_FREQ 75 // Frequency in MHz. Recomm: 40 MHz Overclocking: 80 MHz
// Processor core selection: exactly one NRV_FEMTORV32_xxx should be active.
//`define NRV_FEMTORV32_QUARK // RV32I
//`define NRV_FEMTORV32_TACHYON // RV32I high freq
//`define NRV_FEMTORV32_QUARK_BICYCLE // RV32I 2 CPI
//`define NRV_FEMTORV32_ELECTRON // RV32IM
//`define NRV_FEMTORV32_GRACILIS // RV32IMC, IRQ
`define NRV_FEMTORV32_PETITBATEAU
`define NRV_RESET_ADDR 0 // The address the processor jumps to on reset
/************************* RAM (in bytes, needs to be a multiple of 4)***********************************************/
// Exactly one NRV_RAM definition must be active.
//`define NRV_RAM 393216 // bigger config for ULX3S
`define NRV_RAM 262144 // default for ULX3S
/************************* Advanced devices configuration ***********************************************************/
`define NRV_IO_HARDWARE_CONFIG // Comment-out to disable hardware config registers mapped in IO-Space
// (only if you use your own firmware, libfemtorv32 depends on it)
/********************************************************************************************************************/
`define NRV_CONFIGURED

16
RTL/DEVICES/Buttons.v Normal file
View File

@@ -0,0 +1,16 @@
// femtorv32, a minimalistic RISC-V RV32I core
// Bruno Levy, 2020-2021
//
// This file: driver for the buttons (does nearly nothing,
// could include some filtering here).
// Read-only view of the button pins: when selected, the six button
// levels appear in rdata[5:0] (upper bits zero); when not selected,
// rdata is all zeroes.
module Buttons(
   input  wire        sel,     // select (rdata is forced to zero if low)
   output wire [31:0] rdata,   // read data
   input  wire [5:0]  BUTTONS  // the six pins wired to the buttons
);

   // Button levels zero-extended to the width of the data bus.
   wire [31:0] buttons_word = {26'b0, BUTTONS};

   // Gate the word with the select line.
   assign rdata = buttons_word & {32{sel}};

endmodule

489
RTL/DEVICES/FGA.v Normal file
View File

@@ -0,0 +1,489 @@
// femtorv32, a minimalistic RISC-V RV32I core
// Bruno Levy, 2020-2021
//
// This file: FGA: Femto Graphics Adapter
// Note: VRAM is write-only ! (the read port is used by HDMI)
//
// sel_cntl / io_wstrb / io_rstrb gives access to the set of control
// registers and commands:
//
// Write: set register: value[31:8] REG_XXX[7:0]
// command (1 arg): arg24[31:8] 1[7] CMD_XXX[6:0]
// command (2 args): arg12_2[31:20] arg12_1[19:8] 1[7] CMD_XXX[6:0]
//
// Read: the value of the register indicated by REG_READREGID
//
// Registers:
// REG_STATUS (0): vblank[31] hblank[30] drawarea[29] membusy[28] XXXX[27:24] X[23:12] Y[11:0]
// RESOLUTION (1): height[23:12] width[11:0]
// COLORMODE (2): colormapped[3] bpp[2:0] (0:1bpp 1:2bpp 2:4bpp 3:8bpp 4:16bpp)
// DISPLAYMODE (3): magnify[0]
// ORIGIN (4): origin_pixel_address[23:0] (first scanline starts at this pixel address)
// WRAP (5): wrap_pixel_address[23:0] (restart at pixel address 0 when reached)
// READREGID (6): mapped_regid[2:0] (the register mapped for read access)
//
// Commands:
// SET_PALETTE_R (1) arg12_1: cmap entry arg12_2: R
// SET_PALETTE_G (2) arg12_1: cmap entry arg12_2: G
// SET_PALETTE_B (3) arg12_1: cmap entry arg12_2: B
// SET_WWINDOW_X (4) arg12_1: x1 arg12_2: x2
// SET_WWINDOW_Y (5) arg12_1: y1 arg12_2: y2
// FILLRECT (6) arg24: color
//
// The window [x1-x2] [y1-y2] can be used in two different ways:
// - FILLRECT fills it with the specified color. Operation is
// complete when membusy goes low in REG_STATUS.
// - individual pixel values can be specified one by one by
// writing to the DAT mapped IO (io_wstrb + sel_dat), pixel
// address is incremented automatically.
// This allows emulation of SSD1331/SSD1351 "window write"
// command in the three modes for OLED-HDMI mirroring
//
// See FIRMWARE/LIBFEMTOGL/FGA.h, FGA.c and FGA_mode.c
// "Physical mode" sent to the HDMI (choose one of them)
// Note: > 640x480 may make timings fail
//`define MODE_640x480
`define MODE_800x600
//`define MODE_1024x768
//`define MODE_1280x1024
`include "GFX_hdmi.v"
// FGA: Femto Graphics Adapter.
//
// - The video side (clocked by pixel_clk, produced by GFX_PLL from pclk)
//   scans VRAM and feeds the HDMI encoder.
// - The processor side (clocked by clk) decodes control-register writes
//   and commands, and performs all VRAM writes (direct writes, windowed
//   pixel writes and FILLRECT).
//
// Mode registers are written in the clk domain and read in the pixel_clk
// domain; X, Y and draw_area go the other way for the status register.
// There is no synchronizer on these crossings.
module FGA(
   input wire         pclk,        // board clock
   input wire         clk,         // system clock
   input wire         sel,         // if zero, writes are ignored
   input wire [3:0]   mem_wmask,   // mem write mask and strobe
   input wire [16:0]  mem_address, // address in graphic memory (128K), word-aligned
   input wire [31:0]  mem_wdata,   // data to be written
   output wire [3:0]  gpdi_dp,     // HDMI signals, blue, green, red, clock
                                   // gpdi_dn generated by pins (see ulx3s.lpf)
   input wire         io_wstrb,
   input wire         io_rstrb,
   input wire         sel_cntl,    // IO: select control register (RW)
   input wire         sel_dat,     // IO: select data input (W)
   output wire [31:0] rdata        // data read
);

`include "GFX_modes.v"

   wire pixel_clk;

   reg [31:0] VRAM[0:32767];   // 32768 x 32 bits = 128 kB
   reg [23:0] PALETTE[0:255];  // colormap, {R,G,B}

   /************************* HDMI signal generation ***************************/

   // Video mode parameters
   localparam MODE_1bpp  = 3'd0;
   localparam MODE_2bpp  = 3'd1;
   localparam MODE_4bpp  = 3'd2;
   localparam MODE_8bpp  = 3'd3;
   localparam MODE_16bpp = 3'd4;

   reg [11:0] mode_width;
   reg [11:0] mode_height;
   reg [2:0]  mode_bpp;          // see MODE_xbpp constants
   reg        mode_colormapped;
   reg        mode_magnify;      // asserted for pixel doubling
   reg [23:0] mode_origin_pix_address;
   reg [23:0] mode_wrap_pix_address;

   // This part is just like a VGA generator.
   reg [11:0] X, Y;     // current pixel coordinates
   reg hsync, vsync;    // horizontal and vertical synchronization
   reg draw_area;       // asserted if current pixel is in drawing area
   reg mem_busy;        // asserted if memory transfer is running.

   // Data read from control register
   reg [31:0] read_reg;
   assign rdata = (io_rstrb && sel_cntl) ? read_reg : 32'b0;

   // We are going to fetch data from video RAM (now stored in BRAM), and then,
   // in colormapped modes, fetch colormap entry. Each fetch introduces some
   // latency -> there is a small pixel pipeline. Each stage needs to have
   // its own copy of all registers it needs (that is, copy pixel address
   // between stage 1 and stage 2 to keep it in sync with pixel data).
   //
   // Stage 0 generates the X,Y coordinates and horizontal,vertical sync signals
   //         (standard in all VGA/DVI/HDMI drivers)
   // Stage 1 generates the pixel address. The unit is in number of pixels.
   //         it handles pixel doubling/scanline doubling (mode_magnify)
   //         it also handles page flipping, with the ORIGIN register.
   // Stage 2 fetches pixel data from RAM. It handles pixel address -> word address
   //         translation. It creates its own copy of pixel_address to keep it in
   //         sync with pixel data (1 clock latency)
   // Stage 3 generates R,G,B either from colormap lookup (mode_colormapped set)
   //         or from 16 bit pixel data directly (mode_colormapped clear).
   //
   // Note: the first two pixel columns are wrong due to latency (the image is
   // shifted two pixels to the right, with garbage in the first two columns),
   // normally we should start fetching from the previous scanline, at the end
   // of hsync, in advance. I was too lazy to do that, so I just hide the first
   // two columns !
   // (so there are two columns missing on the right side of the image).
   // I will do that properly when VRAM will be stored in SDRAM (then I'll have no
   // choice, latency will probably be significantly larger than 2 pixels).

   // Stage 0: X,Y,vsync,hsync generation
   always @(posedge pixel_clk) begin
      if(X == GFX_line_width-1) begin
         X <= 0;
         Y <= (Y == GFX_lines-1) ? 0 : Y+1;
      end else begin
         X <= X+1;
      end
      hsync <= (X>=GFX_width+GFX_h_front_porch) &&
               (X<GFX_width+GFX_h_front_porch+GFX_h_sync_width);
      vsync <= (Y>=GFX_height+GFX_v_front_porch) &&
               (Y<GFX_height+GFX_v_front_porch+GFX_v_sync_width);
      draw_area <= (X<GFX_width) && (Y<GFX_height);
   end

   // Stage 1: pixel address generation
   reg [23:0] pix_address;
   reg [23:0] row_start_pix_address;

   // Start address of the next row; goes back to pixel address 0 once
   // (current row start + width) exceeds the WRAP register.
   wire [23:0] next_row_start_pix_address =
      ((row_start_pix_address + {12'b0, mode_width}) <= mode_wrap_pix_address) ?
        row_start_pix_address + {12'b0, mode_width} : 0 ;

   // Generate pixel address based on scanning coordinates (X,Y) and
   // magnify mode (that doubles the rows and doubles the pixels in
   // the rows).
   always @(posedge pixel_clk) begin
      if(X == 0) begin
         if(Y == 0) begin
            row_start_pix_address <= mode_origin_pix_address;
            pix_address           <= mode_origin_pix_address;
         end else begin
            // Increment row address every 2 Y (2 because magnify)
            if(Y[0] || !mode_magnify) begin
               row_start_pix_address <= next_row_start_pix_address;
               pix_address           <= next_row_start_pix_address;
            end else begin
               pix_address <= row_start_pix_address;
            end
         end
      end else begin
         if(X[0] || !mode_magnify) pix_address <= pix_address + 1;
      end
   end

   // Stage 2: pixel data fetch
   // Pixel address -> 32-bit word address, depending on pixel depth.
   reg [23:0] word_address;
   always @(*) begin
      case(mode_bpp)
        MODE_16bpp: word_address = pix_address >> 1;
        MODE_8bpp:  word_address = pix_address >> 2;
        MODE_4bpp:  word_address = pix_address >> 3;
        MODE_2bpp:  word_address = pix_address >> 4;
        MODE_1bpp:  word_address = pix_address >> 5;
        default:    word_address = 0;
      endcase
   end

   reg [23:0] pix_address_2;
   reg [31:0] pix_word_data_2;
   always @(posedge pixel_clk) begin
      pix_address_2   <= pix_address;
      pix_word_data_2 <= VRAM[word_address[14:0]]; // TODO
   end

   // Stage 3: generate R,G,B from pixel data
   // combinatorial circuit to extract index from
   // pixel data.
   reg [7:0] pix_color_index_3;
   /* verilator lint_off WIDTH */
   always @(*) begin
      case(mode_bpp)
        MODE_8bpp: begin
           pix_color_index_3 = pix_word_data_2 >> {pix_address_2[1:0], 3'b0};
        end
        MODE_4bpp: begin
           pix_color_index_3[3:0] = pix_word_data_2 >> {pix_address_2[2:0], 2'b0};
           pix_color_index_3[7:4] = 4'b0;
        end
        MODE_2bpp: begin
           pix_color_index_3[1:0] = pix_word_data_2 >> {pix_address_2[3:0], 1'b0};
           pix_color_index_3[7:2] = 6'b0;
        end
        MODE_1bpp: begin
           pix_color_index_3[0]   = pix_word_data_2 >> pix_address_2[4:0];
           pix_color_index_3[7:1] = 7'b0;
        end
        default: begin
           pix_color_index_3 = 0;
        end
      endcase
   end
   /* verilator lint_on WIDTH */

   // Size of the displayed zone in physical pixels (doubled when magnified).
   reg [11:0] maxX;
   reg [11:0] maxY;
   always @(posedge clk) begin
      maxX <= mode_magnify ? (mode_width  << 1) : mode_width;
      maxY <= mode_magnify ? (mode_height << 1) : mode_height;
   end

   reg [7:0] R,G,B;
   always @(posedge pixel_clk) begin
      if(mode_colormapped) begin
         {R,G,B} <= PALETTE[pix_color_index_3];
      end else begin
         // Direct color: two 16-bit pixels per word, R[15:11] G[10:5] B[4:0].
         if(pix_address_2[0]) begin
            R <= {pix_word_data_2[31:27],3'b000};
            G <= {pix_word_data_2[26:21],2'b00 };
            B <= {pix_word_data_2[20:16],3'b000};
         end else begin
            R <= {pix_word_data_2[15:11],3'b000};
            G <= {pix_word_data_2[10:5 ],2'b00 };
            B <= {pix_word_data_2[ 4:0 ],3'b000};
         end
      end
      // Hide what's outside the display zone.
      // Hide the first two columns (I was too lazy to properly handle my
      // pixel pipeline latency).
      if(X == 0 || X == 1 || X >= maxX || Y >= maxY) {R,G,B} <= 24'b0;
   end

   // Video signal generation and HDMI
   wire pixel_clk_x5; // The pixel_clk*5 freq clock used by the serializers (DDR)

   // The graphic PLL, that generates the pixel clock (and freq*5 clock)
   GFX_PLL gfx_pll(
      .pclk(pclk),
      .pixel_clk(pixel_clk),
      .pixel_clk_x5(pixel_clk_x5)
   );

   // The HDMI encoder
   GFX_hdmi hdmi(
      .pixel_clk(pixel_clk), .pixel_clk_x5(pixel_clk_x5),
      .R(R), .G(G), .B(B), .hsync(hsync), .vsync(vsync), .draw_area(draw_area),
      .gpdi_dp(gpdi_dp)
   );

   /*************************************************************************/

   // Decoding of a word written to the control port.
   // Only the 3 LSBs of the command / register id are decoded.
   wire       is_command = mem_wdata[7];
   wire [2:0] command    = mem_wdata[2:0];
   wire [2:0] set_regid  = mem_wdata[2:0];
   wire[23:0] arg24      = mem_wdata[31:8];
   wire[11:0] arg12_1    = mem_wdata[19:8];
   wire[11:0] arg12_2    = mem_wdata[31:20];

   localparam REG_STATUS      = 3'd0;
   localparam REG_RESOLUTION  = 3'd1;
   localparam REG_COLORMODE   = 3'd2;
   localparam REG_DISPLAYMODE = 3'd3;
   localparam REG_ORIGIN      = 3'd4;
   localparam REG_WRAP        = 3'd5;
   localparam REG_READREGID   = 3'd6;

   localparam CMD_SET_PALETTE_R = 3'd1;
   localparam CMD_SET_PALETTE_G = 3'd2;
   localparam CMD_SET_PALETTE_B = 3'd3;
   localparam CMD_SET_WWINDOW_X = 3'd4;
   localparam CMD_SET_WWINDOW_Y = 3'd5;
   localparam CMD_FILLRECT      = 3'd6;

   // Windowed-pixel write and fillrect command.
   //
   // - write window command, two commands:
   //   (send 32 bits to IO_FGA_CNTL hardware register)
   //      SET_WWINDOW_X: X1 X2
   //      SET_WWINDOW_Y: Y1 Y2
   //
   // - write data: send 16 bits to IO_FGA_DAT hardware register
   //      MSB first, encoding follows SSD1351: RRRRR GGGGG 0 BBBBB
   //
   // Note that once the window is properly initialized, the write
   // data command emulates the SSD1351 OLED display, then by writing
   // to both FGA and SSD1351 control registers, one clones the output
   // of the SSD1351 oled display to the HDMI screen for free !
   //
   // See in <femtorv32.h>:
   //   #define IO_GFX_DAT (IO_SSD1351_DAT16 | IO_FGA_DAT)
   //   #define OLED_WRITE_DATA_UINT16(RGB) IO_OUT(IO_GFX_DAT,(RGB))
   //   #define OLED_WRITE_DATA_RGB(R,G,B) OLED_WRITE_DATA_UINT16(GL_RGB(R,G,B))
   //
   // This also works when FGA is in paletted mode (320x200x8bpp, 640x400x4bpp)
   // since the write data command properly interprets pixel addresses. The
   // only requirement is to have a palette that will correctly map the 8 LSBs
   // / 4 LSBs of pixel data to a color. In libfemtorv32, this maps 0 to black
   // and any non-zero to white (this is how COMMANDER is displayed in 640x400
   // on the HDMI screen).
   //
   // To generate pixel data, there are two other options:
   //   - directly writing to VRAM from FemtoRV32
   //   - FILLRECT (see below)
   reg [11:0] window_x1, window_x2, window_y1, window_y2, window_x, window_y;
   reg [23:0] window_row_start;
   reg [23:0] window_pixel_address;
   reg [15:0] fill_color;
   reg        fill_rect;

   // Data read from control register: depends on mapped register (read_regid)
   reg [2:0] read_regid;

   // Known power-up state. Without it, mem_busy and fill_rect are X in
   // simulation, so the direct VRAM write path (sel && !mem_busy) drops
   // every write until a window transfer completes.
   initial begin
      mem_busy   = 1'b0;
      fill_rect  = 1'b0;
      read_regid = REG_STATUS;
   end

   // Register read-back. Any id without its own entry (including REG_STATUS)
   // returns the status word {vblank, hblank, draw_area, mem_busy, 4'b0, X, Y}.
   // NOTE(review): vblank/hblank are tested against fixed 400/640 rather than
   // the configured mode size -- confirm this is intended for all modes.
   // X, Y and draw_area are registered on pixel_clk and sampled here on clk.
   always @(posedge clk) begin
      case(read_regid)
        REG_RESOLUTION:  read_reg <= {8'b0, mode_height, mode_width};
        REG_COLORMODE:   read_reg <= {28'b0, mode_colormapped, mode_bpp};
        REG_DISPLAYMODE: read_reg <= {31'b0, mode_magnify};
        REG_ORIGIN:      read_reg <= {8'b0, mode_origin_pix_address};
        REG_WRAP:        read_reg <= {8'b0, mode_wrap_pix_address};
        REG_READREGID:   read_reg <= {29'b0, read_regid};
        default:         read_reg <= {(Y >= 400),(X >= 640),draw_area,mem_busy,4'b0,X,Y};
      endcase
   end

   // Window cursor update and control-port write decoding.
   // Assignments made by the command decoder come last, so they win over
   // the cursor update when both happen in the same cycle.
   always @(posedge clk) begin
      // Advance the window cursor for each pixel written (data port or FILLRECT).
      if(mem_busy && ((io_wstrb && sel_dat) || fill_rect)) begin
         window_pixel_address <= window_pixel_address + 1;
         window_x <= window_x + 1;
         if(window_x == window_x2) begin
            if(window_y == window_y2) begin
               // Last pixel of the window: transfer done.
               mem_busy  <= 1'b0;
               fill_rect <= 1'b0;
            end else begin
               // End of row: go to the start of the next row.
               window_y <= window_y+1;
               window_x <= window_x1;
               window_pixel_address <= window_row_start + {12'b0, mode_width};
               window_row_start     <= window_row_start + {12'b0, mode_width};
            end
         end
      end

      if(io_wstrb && sel_cntl) begin
         if(is_command) begin
            case(command)
              CMD_SET_PALETTE_B: PALETTE[arg12_1[7:0]][7:0  ] <= arg12_2[7:0];
              CMD_SET_PALETTE_G: PALETTE[arg12_1[7:0]][15:8 ] <= arg12_2[7:0];
              CMD_SET_PALETTE_R: PALETTE[arg12_1[7:0]][23:16] <= arg12_2[7:0];
              CMD_SET_WWINDOW_X: begin
                 window_x1 <= arg12_1;
                 window_x2 <= arg12_2;
                 window_x  <= arg12_1;
                 mem_busy  <= 1'b1;
              end
              CMD_SET_WWINDOW_Y: begin
                 window_y1 <= arg12_1;
                 window_y2 <= arg12_2;
                 window_y  <= arg12_1;
                 mem_busy  <= 1'b1;
                 /* verilator lint_off WIDTH */
                 window_row_start     <= arg12_1 * mode_width + window_x1;
                 window_pixel_address <= arg12_1 * mode_width + window_x1;
                 /* verilator lint_on WIDTH */
              end
              CMD_FILLRECT: begin
                 // Only start a fill when a window is armed (mem_busy set by
                 // SET_WWINDOW_X/Y). Otherwise fill_rect would never be
                 // cleared: it would write one pixel every clock, block
                 // direct VRAM writes, and start filling as soon as the
                 // next window command arrives.
                 fill_rect  <= mem_busy;
                 fill_color <= arg24[15:0];
              end
              default: begin end
            endcase
         end else begin
            case(set_regid)
              REG_RESOLUTION:  {mode_height, mode_width}    <= arg24;
              REG_COLORMODE:   {mode_colormapped, mode_bpp} <= arg24[3:0];
              REG_DISPLAYMODE: mode_magnify                 <= arg24[0];
              REG_READREGID:   read_regid                   <= arg24[2:0];
              REG_ORIGIN:      mode_origin_pix_address      <= arg24;
              REG_WRAP:        mode_wrap_pix_address        <= arg24;
              default: begin end
            endcase
         end
      end
   end

   // Write to VRAM (FILLRECT and interface with processor)
   wire [14:0] vram_word_address = mem_address[16:2];
   wire [15:0] pixel_color = fill_rect ? fill_color : mem_wdata[15:0];

   // FILLRECT:
   // The fillrect command repeatedly sends the same pixel data to the current
   // window. It has two advantages as compared to do that by hand:
   //   - fills one pixel per clock (whereas in its fastest configuration,
   //     FemtoRV32 uses 6 clocks per loop iteration)
   //   - execution can continue, which lets FemtoRV prepare the next drawing
   //     operation. Before sending more data to FGA, FemtoRV needs to test
   //     the FGA_BUSY_bit in the control register, as follows:
   //        while(IO_IN(IO_FGA_CNTL) & FGA_BUSY_bit);
   // This is used in LIBFEMTORV32/FGA.c, to implement hardware-accelerated
   // polygon fill (using one FILLRECT call per polygon scanline).
   always @(posedge clk) begin
      // FILLRECT or pixel data sent to the graphic data port
      if(fill_rect || (io_wstrb && sel_dat && mem_busy)) begin
         /* verilator lint_off CASEINCOMPLETE */
         case(mode_bpp)
           MODE_16bpp: begin
              case(window_pixel_address[0])
                1'b0: VRAM[window_pixel_address[15:1]][15:0 ] <= pixel_color;
                1'b1: VRAM[window_pixel_address[15:1]][31:16] <= pixel_color;
              endcase
           end
           MODE_8bpp: begin
              case(window_pixel_address[1:0])
                2'b00: VRAM[window_pixel_address[16:2]][ 7:0 ] <= pixel_color[7:0];
                2'b01: VRAM[window_pixel_address[16:2]][15:8 ] <= pixel_color[7:0];
                2'b10: VRAM[window_pixel_address[16:2]][23:16] <= pixel_color[7:0];
                2'b11: VRAM[window_pixel_address[16:2]][31:24] <= pixel_color[7:0];
              endcase
           end
           MODE_4bpp: begin
              case(window_pixel_address[2:0])
                3'b000: VRAM[window_pixel_address[17:3]][ 3:0 ] <= pixel_color[3:0];
                3'b001: VRAM[window_pixel_address[17:3]][ 7:4 ] <= pixel_color[3:0];
                3'b010: VRAM[window_pixel_address[17:3]][11:8 ] <= pixel_color[3:0];
                3'b011: VRAM[window_pixel_address[17:3]][15:12] <= pixel_color[3:0];
                3'b100: VRAM[window_pixel_address[17:3]][19:16] <= pixel_color[3:0];
                3'b101: VRAM[window_pixel_address[17:3]][23:20] <= pixel_color[3:0];
                3'b110: VRAM[window_pixel_address[17:3]][27:24] <= pixel_color[3:0];
                3'b111: VRAM[window_pixel_address[17:3]][31:28] <= pixel_color[3:0];
              endcase
           end
           MODE_2bpp: begin
              case(window_pixel_address[3:0])
                4'b0000: VRAM[window_pixel_address[18:4]][ 1:0 ] <= pixel_color[1:0];
                4'b0001: VRAM[window_pixel_address[18:4]][ 3:2 ] <= pixel_color[1:0];
                4'b0010: VRAM[window_pixel_address[18:4]][ 5:4 ] <= pixel_color[1:0];
                4'b0011: VRAM[window_pixel_address[18:4]][ 7:6 ] <= pixel_color[1:0];
                4'b0100: VRAM[window_pixel_address[18:4]][ 9:8 ] <= pixel_color[1:0];
                4'b0101: VRAM[window_pixel_address[18:4]][11:10] <= pixel_color[1:0];
                4'b0110: VRAM[window_pixel_address[18:4]][13:12] <= pixel_color[1:0];
                4'b0111: VRAM[window_pixel_address[18:4]][15:14] <= pixel_color[1:0];
                4'b1000: VRAM[window_pixel_address[18:4]][17:16] <= pixel_color[1:0];
                4'b1001: VRAM[window_pixel_address[18:4]][19:18] <= pixel_color[1:0];
                4'b1010: VRAM[window_pixel_address[18:4]][21:20] <= pixel_color[1:0];
                4'b1011: VRAM[window_pixel_address[18:4]][23:22] <= pixel_color[1:0];
                4'b1100: VRAM[window_pixel_address[18:4]][25:24] <= pixel_color[1:0];
                4'b1101: VRAM[window_pixel_address[18:4]][27:26] <= pixel_color[1:0];
                4'b1110: VRAM[window_pixel_address[18:4]][29:28] <= pixel_color[1:0];
                4'b1111: VRAM[window_pixel_address[18:4]][31:30] <= pixel_color[1:0];
              endcase
           end
           default: begin // 1bpp (also reached for unassigned mode_bpp values)
              VRAM[window_pixel_address[19:5]][window_pixel_address[4:0]] <= pixel_color[0];
           end
         endcase
         /* verilator lint_on CASEINCOMPLETE */
      end else if(sel && !mem_busy) begin // Direct VRAM write from FemtoRV32
         if(mem_wmask[0]) VRAM[vram_word_address][ 7:0 ] <= mem_wdata[ 7:0 ];
         if(mem_wmask[1]) VRAM[vram_word_address][15:8 ] <= mem_wdata[15:8 ];
         if(mem_wmask[2]) VRAM[vram_word_address][23:16] <= mem_wdata[23:16];
         if(mem_wmask[3]) VRAM[vram_word_address][31:24] <= mem_wdata[31:24];
      end
   end

endmodule

154
RTL/DEVICES/GFX_hdmi.v Normal file
View File

@@ -0,0 +1,154 @@
// Define one of:
// MODE_640x480, MODE_800x600, MODE_1024x768, MODE_1280x1024.
// ("physical mode" sent to the HDMI)
`include "TMDS_encoder.v"
// Generate HDMI signal from VGA signal.
//
// R,G,B, hsync, vsync and draw_area are sampled on pixel_clk by three TMDS
// encoders. The resulting 10-bit symbols are loaded into shift registers
// clocked by pixel_clk_x5 and sent out two bits per pixel_clk_x5 cycle.
module GFX_hdmi(
   input wire       pixel_clk,    // pixel clock
   input wire       pixel_clk_x5, // 5 times pixel clock freq (used by TMDS serializer)
                                  // The TMDS serializers operate at (pixel_clock_freq * 10),
                                  // but we use DDR mode, hence (pixel_clock_freq * 5).
   input wire [7:0] R,
   input wire [7:0] G,
   input wire [7:0] B,
   input wire       hsync,
   input wire       vsync,
   input wire       draw_area,
   output wire [3:0] gpdi_dp      // HDMI signals, blue, green, red, clock
                                  // gpdi_dn generated by pins (see, e.g., ulx3s.lpf)
);

   // RGB TMDS encoding
   // Generate 10-bits TMDS red,green,blue signals. Blue embeds HSync/VSync in its
   // control part.
   wire [9:0] TMDS_R, TMDS_G, TMDS_B;
   TMDS_encoder encode_R(.clk(pixel_clk), .VD(R), .CD(2'b00)        , .VDE(draw_area), .TMDS(TMDS_R));
   TMDS_encoder encode_G(.clk(pixel_clk), .VD(G), .CD(2'b00)        , .VDE(draw_area), .TMDS(TMDS_G));
   TMDS_encoder encode_B(.clk(pixel_clk), .VD(B), .CD({vsync,hsync}), .VDE(draw_area), .TMDS(TMDS_B));

   // Modulo-5 clock divider: a one-hot bit rotating through 5 positions,
   // so TMDS_shift_load is high one pixel_clk_x5 cycle out of five.
   reg [4:0] TMDS_mod5=1;
   wire TMDS_shift_load = TMDS_mod5[4];
   always @(posedge pixel_clk_x5) TMDS_mod5 <= {TMDS_mod5[3:0],TMDS_mod5[4]};

   // Shifters
   // Every 5 clocks, we get a fresh R,G,B triplet from the TMDS encoders,
   // else we shift (by two bits, since two bits are sent per clock).
   reg [9:0] TMDS_shift_R=0, TMDS_shift_G=0, TMDS_shift_B=0;
   always @(posedge pixel_clk_x5) begin
      TMDS_shift_R <= TMDS_shift_load ? TMDS_R : {2'b00,TMDS_shift_R[9:2]};
      TMDS_shift_G <= TMDS_shift_load ? TMDS_G : {2'b00,TMDS_shift_G[9:2]};
      TMDS_shift_B <= TMDS_shift_load ? TMDS_B : {2'b00,TMDS_shift_B[9:2]};
   end

`ifdef BENCH_OR_LINT
   // Simulation / lint stand-in for the DDR primitives: keeps the three
   // data outputs driven (only bit 0 of each shifter is observable here).
   assign gpdi_dp[2] = TMDS_shift_R[0];
   assign gpdi_dp[1] = TMDS_shift_G[0];
   assign gpdi_dp[0] = TMDS_shift_B[0];
`else
 `ifdef ULX3S
   // DDR serializers: they send D0 at the rising edge and D1 at the falling edge.
   ODDRX1F ddr_R (.D0(TMDS_shift_R[0]), .D1(TMDS_shift_R[1]), .Q(gpdi_dp[2]), .SCLK(pixel_clk_x5), .RST(1'b0));
   ODDRX1F ddr_G (.D0(TMDS_shift_G[0]), .D1(TMDS_shift_G[1]), .Q(gpdi_dp[1]), .SCLK(pixel_clk_x5), .RST(1'b0));
   ODDRX1F ddr_B (.D0(TMDS_shift_B[0]), .D1(TMDS_shift_B[1]), .Q(gpdi_dp[0]), .SCLK(pixel_clk_x5), .RST(1'b0));
 `endif
`endif

   // The pixel clock is sent through the fourth differential pair.
   assign gpdi_dp[3] = pixel_clk;

endmodule
/**************************************************************************************/
// GFX_PLL: produces the pixel clock and the 5x pixel clock from the board clock.
// Two variants are provided:
//   - BENCH_OR_LINT: no PLL, both outputs are simply the board clock;
//   - ULX3S: an EHXPLLL instance configured by the selected MODE_xxx macro.
// If neither BENCH_OR_LINT nor ULX3S is defined, no GFX_PLL module is declared.
`ifdef BENCH_OR_LINT
module GFX_PLL(
input wire pclk, // the board's clock
output wire pixel_clk, // pixel clock
output wire pixel_clk_x5 // 5 times pixel clock freq (used by TMDS serializer)
);
// Stand-in: both clocks run at the board clock frequency (no x5 ratio here).
assign pixel_clk = pclk;
assign pixel_clk_x5 = pclk;
endmodule
`else
`ifdef ULX3S
module GFX_PLL(
input wire pclk, // the board's clock
output wire pixel_clk, // pixel clock
output wire pixel_clk_x5 // 5 times pixel clock freq (used by TMDS serializer)
// The TMDS serializers operate at (pixel_clock_freq * 10),
// but we use DDR mode, hence (pixel_clock_freq * 5).
);
// The parameters of the PLL,
// They are found by using: ecppll -i 25 -o <5*pixel_clock> -f foobar.v
// Exactly one MODE_xxx macro must be defined, otherwise the localparams
// used by the PLL instance below are not declared.
`ifdef MODE_640x480
localparam CLKI_DIV = 1;
localparam CLKOP_DIV = 5;
localparam CLKOP_CPHASE = 2;
localparam CLKOP_FPHASE = 0;
localparam CLKFB_DIV = 5;
`endif
`ifdef MODE_800x600
localparam CLKI_DIV = 1;
localparam CLKOP_DIV = 3;
localparam CLKOP_CPHASE = 1;
localparam CLKOP_FPHASE = 0;
localparam CLKFB_DIV = 8;
`endif
`ifdef MODE_1024x768
localparam CLKI_DIV = 1;
localparam CLKOP_DIV = 2;
localparam CLKOP_CPHASE = 1;
localparam CLKOP_FPHASE = 0;
localparam CLKFB_DIV = 13;
`endif
`ifdef MODE_1280x1024
localparam CLKI_DIV = 3;
localparam CLKOP_DIV = 1;
localparam CLKOP_CPHASE = 0;
localparam CLKOP_FPHASE = 0;
localparam CLKFB_DIV = 65;
`endif
// The PLL converts a 25 MHz clock into a (pixel_clock_freq * 5) clock
// The (half) TMDS serializer clock is generated on pin CLKOP.
// In addition, the pixel clock (at TMDS freq/5) is generated on
// pin CLKOS (hence CLKOS_DIV = 5*CLKOP_DIV).
// CLKOP is also wired back to CLKFB as the feedback clock.
(* ICP_CURRENT="12" *) (* LPF_RESISTOR="8" *) (* MFG_ENABLE_FILTEROPAMP="1" *) (* MFG_GMCREF_SEL="2" *)
EHXPLLL #(
.CLKI_DIV(CLKI_DIV),
.CLKOP_DIV(CLKOP_DIV),
.CLKOP_CPHASE(CLKOP_CPHASE),
.CLKOP_FPHASE(CLKOP_FPHASE),
.CLKOS_ENABLE("ENABLED"),
.CLKOS_DIV(5*CLKOP_DIV),
.CLKOS_CPHASE(CLKOP_CPHASE),
.CLKOS_FPHASE(CLKOP_FPHASE),
.CLKFB_DIV(CLKFB_DIV)
) pll_i (
.CLKI(pclk),
.CLKOP(pixel_clk_x5),
.CLKFB(pixel_clk_x5),
.CLKOS(pixel_clk),
.PHASESEL0(1'b0),
.PHASESEL1(1'b0),
.PHASEDIR(1'b1),
.PHASESTEP(1'b1),
.PHASELOADREG(1'b1),
.PLLWAKESYNC(1'b0),
.ENCLKOP(1'b0)
);
endmodule
`endif
`endif

57
RTL/DEVICES/GFX_modes.v Normal file
View File

@@ -0,0 +1,57 @@
// Video timing parameters, meant to be `included inside a module.
// Define one of:
// MODE_640x480, MODE_800x600, MODE_1024x768, MODE_1280x1024.
// Exactly one must be defined, otherwise the GFX_xxx localparams are
// either missing or declared several times.
//
// For each mode:
//   GFX_pixel_clock      pixel clock in MHz
//   GFX_width/height     visible area, in pixels / lines
//   GFX_h_xxx            horizontal front porch, sync width, back porch (pixels)
//   GFX_v_xxx            vertical front porch, sync width, back porch (lines)
/********************** Modes ****************************/
`ifdef MODE_640x480
localparam GFX_pixel_clock = 25;
localparam GFX_width = 640;
localparam GFX_height = 480;
localparam GFX_h_front_porch = 16;
localparam GFX_h_sync_width = 96;
localparam GFX_h_back_porch = 48;
localparam GFX_v_front_porch = 10;
localparam GFX_v_sync_width = 2;
// NOTE(review): these values give 524 lines per frame; the usual
// 640x480@60 timing has a 33-line back porch (525 lines) -- confirm
// whether 32 is intentional.
localparam GFX_v_back_porch = 32;
`endif
`ifdef MODE_800x600
localparam GFX_pixel_clock = 40;
localparam GFX_width = 800;
localparam GFX_height = 600;
localparam GFX_h_front_porch = 40;
localparam GFX_h_sync_width = 128;
localparam GFX_h_back_porch = 88;
localparam GFX_v_front_porch = 1;
localparam GFX_v_sync_width = 4;
localparam GFX_v_back_porch = 23;
`endif
`ifdef MODE_1024x768
localparam GFX_pixel_clock = 65;
localparam GFX_width = 1024;
localparam GFX_height = 768;
localparam GFX_h_front_porch = 24;
localparam GFX_h_sync_width = 136;
localparam GFX_h_back_porch = 160;
localparam GFX_v_front_porch = 3;
localparam GFX_v_sync_width = 6;
localparam GFX_v_back_porch = 29;
`endif
`ifdef MODE_1280x1024
localparam GFX_pixel_clock = 108;
localparam GFX_width = 1280;
localparam GFX_height = 1024;
localparam GFX_h_front_porch = 48;
localparam GFX_h_sync_width = 112;
localparam GFX_h_back_porch = 248;
localparam GFX_v_front_porch = 1;
localparam GFX_v_sync_width = 3;
localparam GFX_v_back_porch = 38;
`endif
// Totals, including blanking: pixels per scanline and scanlines per frame.
localparam GFX_line_width = GFX_width + GFX_h_front_porch + GFX_h_sync_width + GFX_h_back_porch;
localparam GFX_lines = GFX_height + GFX_v_front_porch + GFX_v_sync_width + GFX_v_back_porch;

View File

@@ -0,0 +1,59 @@
// femtorv32, a minimalistic RISC-V RV32I core
// Bruno Levy, 2020-2021
//
// This file: memory-mapped constants to query
// hardware config.
// Read-only constants describing the configured SoC. Each select line
// exposes one 32-bit word on rdata; sel_memory has priority over
// sel_devices, which has priority over sel_cpuinfo. rdata is zero when
// no select line is asserted. The clk input is not used by this module.
module HardwareConfig(
   input wire         clk,
   input wire         sel_memory,  // available RAM
   input wire         sel_devices, // configured devices
   input wire         sel_cpuinfo, // CPU information
   output wire [31:0] rdata        // read data
);

`include "HardwareConfig_bits.v"

   // Width of the cycles counter (32 unless overridden by the configuration).
`ifdef NRV_COUNTER_WIDTH
   localparam counter_width = `NRV_COUNTER_WIDTH;
`else
   localparam counter_width = 32;
`endif

   // Bitmask of configured devices: one bit set per enabled IO register,
   // at the position given by the matching IO_xxx_bit constant.
   // NOTE(review): the NRV_IO_SPI_FLASH term needs IO_SPI_FLASH_bit to be
   // declared by HardwareConfig_bits.v -- confirm it is.
   localparam NRV_DEVICES = 0
`ifdef NRV_IO_LEDS
      | (1 << IO_LEDS_bit)
`endif
`ifdef NRV_IO_UART
      | (1 << IO_UART_DAT_bit)
      | (1 << IO_UART_CNTL_bit)
`endif
`ifdef NRV_IO_SSD1351_1331
      | (1 << IO_SSD1351_CNTL_bit)
      | (1 << IO_SSD1351_CMD_bit)
      | (1 << IO_SSD1351_DAT_bit)
`endif
`ifdef NRV_IO_MAX7219
      | (1 << IO_MAX7219_DAT_bit)
`endif
`ifdef NRV_IO_SPI_FLASH
      | (1 << IO_SPI_FLASH_bit)
`endif
`ifdef NRV_MAPPED_SPI_FLASH
      | (1 << IO_MAPPED_SPI_FLASH_bit)
`endif
`ifdef NRV_IO_SDCARD
      | (1 << IO_SDCARD_bit)
`endif
`ifdef NRV_IO_BUTTONS
      | (1 << IO_BUTTONS_bit)
`endif
`ifdef NRV_IO_FGA
      | (1 << IO_FGA_CNTL_bit)
      | (1 << IO_FGA_DAT_bit)
`endif
   ;

   // The three words that can be read back.
   wire [31:0] memory_word  = `NRV_RAM;                            // RAM size, in bytes
   wire [31:0] devices_word = NRV_DEVICES;                         // configured devices
   wire [31:0] cpuinfo_word = (`NRV_FREQ << 16) | counter_width;   // frequency, counter width

   assign rdata = sel_memory  ? memory_word  :
                  sel_devices ? devices_word :
                  sel_cpuinfo ? cpuinfo_word :
                                32'b0;

endmodule

View File

@@ -0,0 +1,24 @@
// Bit positions of the memory-mapped IO registers, meant to be `included
// inside a module.
// We got a total of 20 bits for 1-hot addressing of IO registers.
// Bits 12 to 16 are not assigned here.
localparam IO_LEDS_bit = 0; // RW four leds
localparam IO_UART_DAT_bit = 1; // RW write: data to send (8 bits) read: received data (8 bits)
localparam IO_UART_CNTL_bit = 2; // R status. bit 8: valid read data. bit 9: busy sending
localparam IO_SSD1351_CNTL_bit = 3; // W Oled display control
localparam IO_SSD1351_CMD_bit = 4; // W Oled display commands (8 bits)
localparam IO_SSD1351_DAT_bit = 5; // W Oled display data (8 bits)
localparam IO_SSD1351_DAT16_bit = 6; // W Oled display data (16 bits)
localparam IO_MAX7219_DAT_bit = 7; // W led matrix data (16 bits)
localparam IO_SDCARD_bit = 8; // RW write: bit 0: mosi bit 1: clk bit 2: csn read: miso
localparam IO_BUTTONS_bit = 9; // R buttons state
localparam IO_FGA_CNTL_bit = 10; // RW write: send command read: get VSync/HSync/MemBusy/X/Y state
localparam IO_FGA_DAT_bit = 11; // W write: write pixel data
// The three constant hardware config registers, using the three last bits of IO address space
localparam IO_HW_CONFIG_RAM_bit = 17; // R total quantity of RAM, in bytes
localparam IO_HW_CONFIG_DEVICES_bit = 18; // R configured devices
// NOTE(review): layout listed as CPL(6) FREQ(10) RESERVED(16); confirm against
// the word actually built in HardwareConfig.v.
localparam IO_HW_CONFIG_CPUINFO_bit = 19; // R CPU information CPL(6) FREQ(10) RESERVED(16)
// These devices do not have hardware registers. Just a bit set in IO_HW_CONFIG_DEVICES
// (bit 20 lies outside the 20 addressing bits, 0 to 19).
localparam IO_MAPPED_SPI_FLASH_bit = 20; // no register (just there to indicate presence)

53
RTL/DEVICES/LEDs.v Normal file
View File

@@ -0,0 +1,53 @@
// femtorv32, a minimalistic RISC-V RV32I core
// Bruno Levy, 2020-2021
//
// This file: driver for LEDs (does nearly nothing !)
//
// Memory-mapped LED driver.
// A write (sel && wstrb) latches the low bits of wdata into led_state;
// a read returns the latched state in the low bits of rdata.
// With NRV_IO_IRDA defined, led_state is widened by two bits that drive the
// infrared transceiver pins, and the received IR level is readable as bit 6.
module LEDDriver(
`ifdef NRV_IO_IRDA
    output wire        irda_TXD, // driven by led_state[4]
    input  wire        irda_RXD, // returned as bit 6 of rdata
    output wire        irda_SD,  // driven by led_state[5]
`endif
    input  wire        clk,      // system clock
    input  wire        rstrb,    // read strobe (not used by this driver)
    input  wire        wstrb,    // write strobe
    input  wire        sel,      // select (read/write ignored if low)
    input  wire [31:0] wdata,    // data to be written
    output wire [31:0] rdata,    // read data
    output wire [3:0]  LED       // LED pins
);

   // The IceStick has an infrared receiver/transmitter pair
   // See EXAMPLES/test_ir_sensor.c and EXAMPLES/test_ir_remote.c
`ifdef NRV_IO_IRDA
   reg [5:0] led_state;

   // Give the state a defined power-up value, exactly as the non-IRDA
   // configuration does; otherwise LED, irda_TXD and irda_SD are X in
   // simulation until the first write.
   initial begin
      led_state = 6'b000000;
   end

   assign LED      = led_state[3:0];
   assign rdata    = (sel ? {25'b0, irda_RXD, led_state} : 32'b0);
   assign irda_SD  = led_state[5];
   assign irda_TXD = led_state[4];
`else
   reg [3:0] led_state;

   initial begin
      led_state = 4'b0000;
   end

   assign LED   = led_state;
   assign rdata = (sel ? {28'b0, led_state} : 32'b0);
`endif

   // Latch the written value; reads have no side effect.
   always @(posedge clk) begin
      if(sel && wstrb) begin
`ifdef NRV_IO_IRDA
         led_state <= wdata[5:0];
`else
         led_state <= wdata[3:0];
`endif
`ifdef BENCH
         $display("****************** LEDs = %b", wdata[3:0]);
`endif
      end
   end

endmodule

51
RTL/DEVICES/MAX7219.v Normal file
View File

@@ -0,0 +1,51 @@
// femtorv32, a minimalistic RISC-V RV32I core
// Bruno Levy, 2020-2021
//
// This file: driver for MAX7219 led matrix display
// Write-only driver for a MAX7219 LED matrix.
// A write (wstrb && sel) loads wdata[15:0] into a shift register that is then
// sent MSB first on DIN, one bit per slow_clk tick (system clock / 8).
// wbusy is asserted, and CS held low, while the 16 bits are being sent.
module MAX7219(
    input  wire        clk,   // system clock
    input  wire        wstrb, // write strobe
    input  wire        sel,   // write ignored if low
    input  wire [31:0] wdata, // data to be written
    output wire        wbusy, // asserted if the driver is busy sending data
    // MAX7219 pins
    output wire        DIN,   // data in
    output wire        CLK,   // clock
    output wire        CS     // chip select
);

   // Free-running divide-by-8 prescaler.
   // Initialised so that slow_clk is defined in simulation (an
   // uninitialised counter stays X forever when incremented).
   reg [2:0] divider = 3'b000;
   always @(posedge clk) begin
      divider <= divider + 3'd1;
   end

   // clk=60MHz, slow_clk=60/8 MHz (max = 10 MHz)
   // NOTE(review): the division ratio is fixed at 8 whatever the system
   // clock is; check the 10 MHz limit for faster system clocks.
   wire slow_clk = (divider == 3'b000); // one-cycle pulse every 8 clk cycles

   reg [4:0]  bitcount = 5'd0;  // remaining bits to send, 0 means idle
   reg [15:0] shifter  = 16'd0; // bits still to be sent, MSB first

   wire sending = |bitcount;

   assign DIN   = shifter[15];
   assign wbusy = sending;
   assign CS    = !sending;
   assign CLK   = sending && slow_clk;

   always @(posedge clk) begin
      if(wstrb && sel) begin
         // New word: (re)start a 16-bit transfer.
         shifter  <= wdata[15:0];
         bitcount <= 5'd16;
      end else if(sending && slow_clk) begin
         // CLK pulses during this cycle, so the shift must happen at its
         // end even when wstrb is high for another device (sel low);
         // skipping it would clock the same bit out twice.
         bitcount <= bitcount - 5'd1;
         shifter  <= {shifter[14:0], 1'b0};
      end
   end

endmodule

View File

@@ -0,0 +1,393 @@
// femtorv32, a minimalistic RISC-V RV32I core
// (minus SYSTEM and FENCE that are not implemented)
//
// Bruno Levy, 2020-2021
// Matthias Koch, 2021
//
// This file: driver for SPI Flash, projected in memory space (readonly)
//
// TODO: go faster with XIP mode and dummy cycles customization
// - send write enable command (06h)
// - send write volatile config register command (08h REG)
// REG=dummy_cycles[7:4]=4'b0100 XIP[3]=1'b1 reserved[2]=1'b0 wrap[1:0]=2'b11
// (4 dummy cycles, works at up to 90 MHz according to datasheet)
//
// DataSheets:
// https://media-www.micron.com/-/media/client/global/documents/products/data-sheet/nor-flash/serial-nor/n25q/n25q_32mb_3v_65nm.pdf?rev=27fc6016fc5249adb4bb8f221e72b395
// https://www.winbond.com/resource-files/w25q128jv%20spi%20revc%2011162016.pdf (not the same chip, mostly compatible, datasheet is easier to read)
// The one on the ULX3S: https://www.issi.com/WW/pdf/25LP-WP128F.pdf
// this one supports quad-SPI mode, IO0=SI, IO1=SO, IO2=WP, IO3=Hold/Reset
// There are four versions (from slowest to fastest)
//
// Version (used command) | cycles per 32-bits read | Specificity |
// ----------------------------------------------------------|-----------------------|
// SPI_FLASH_READ | 64 slow (50 MHz) | Standard |
// SPI_FLASH_FAST_READ | 72 fast (100 MHz) | Uses dummy cycles |
// SPI_FLASH_FAST_READ_DUAL_OUTPUT | 56 fast | Reverts MOSI |
// SPI_FLASH_FAST_READ_DUAL_IO | 44 fast | Reverts MISO and MOSI |
// One can go even faster by configuring number of dummy cycles (can save up to 4 cycles per read)
// and/or using XIP mode (that just requires the address to be sent, saves 16 cycles per 32-bits read)
// (I tried both without success). This may require another mechanism to change configuration register.
//
// Most chips support a QUAD IO mode that uses four bidirectional pins.
// However, it cannot be used here because the IO2 and IO3 pins
// are not wired on the IceStick (one may solder a tiny wire and plug it
// to a GPIO pin, but I do not have the soldering skills for things of that size!)
// It is a pity, because one could go really fast with these pins!
// Macros to select version and number of dummy cycles based on the board.
// Per-board selection of the MappedSPIFlash variant.
// Each board section defines one SPI_FLASH_xxx variant macro and
// SPI_FLASH_CONFIGURED. Every variant below declares a module with the same
// name (MappedSPIFlash), so only one variant macro should end up defined.
`ifdef ICE_STICK
`define SPI_FLASH_FAST_READ_DUAL_IO
`define SPI_FLASH_CONFIGURED
`endif
// ICE4PI: undefines the dual-IO selection and the CONFIGURED flag, so the
// default at the bottom of this section (SPI_FLASH_READ) is used.
`ifdef ICE4PI
`undef SPI_FLASH_FAST_READ_DUAL_IO
`undef SPI_FLASH_CONFIGURED
`endif
`ifdef ICE_BREAKER
`define SPI_FLASH_FAST_READ_DUAL_IO
`define SPI_FLASH_DUMMY_CLOCKS 4 // Winbond SPI chips on icebreaker uses 4 dummy clocks
`define SPI_FLASH_CONFIGURED
`endif
`ifdef ULX3S
`define SPI_FLASH_FAST_READ // TODO check whether dual IO mode can be done / dummy clocks
`define SPI_FLASH_CONFIGURED
`endif
`ifdef ARTY
`define SPI_FLASH_READ
`define SPI_FLASH_CONFIGURED
`endif
`ifdef ICE_SUGAR_NANO
`define SPI_FLASH_READ
`define SPI_FLASH_CONFIGURED
`endif
// Number of dummy clocks: 8 unless a board section above chose another value.
// In this file only the DUAL_IO variant reads this macro.
`ifndef SPI_FLASH_DUMMY_CLOCKS
`define SPI_FLASH_DUMMY_CLOCKS 8
`endif
`ifndef SPI_FLASH_CONFIGURED // Default: using slowest / simplest mode (command $03)
`define SPI_FLASH_READ
`endif
/********************************************************************************************************************************/
`ifdef SPI_FLASH_READ
// Read-only, memory-mapped SPI flash using the standard READ command (03h).
// A read strobe starts a transaction: 32 clocks to send command + address,
// then 32 clocks to receive one 32-bit word on MISO (one bit per clock).
module MappedSPIFlash(
input wire clk, // system clock
input wire rstrb, // read strobe
input wire [19:0] word_address, // address of the word to be read
output wire [31:0] rdata, // data read
output wire rbusy, // asserted if busy receiving data
// SPI flash pins
output wire CLK, // clock
output reg CS_N, // chip select negated (active low)
output wire MOSI, // master out slave in (data to be sent to flash)
input wire MISO // master in slave out (data received from flash)
);
reg [5:0] snd_bitcount; // bits still to be sent (0 = not sending)
reg [31:0] cmd_addr; // command + address shift register, sent MSB first
reg [5:0] rcv_bitcount; // bits still to be received (0 = not receiving)
reg [31:0] rcv_data; // received bits, shifted in from the LSB side
wire sending = (snd_bitcount != 0);
wire receiving = (rcv_bitcount != 0);
wire busy = sending | receiving;
// rbusy follows chip select: it stays high for the whole transaction.
assign rbusy = !CS_N;
assign MOSI = cmd_addr[31];
initial CS_N = 1'b1;
assign CLK = !CS_N && !clk; // CLK needs to be inverted (sample on posedge, shift on negedge)
// and needs to be disabled when not sending/receiving (&& !CS_N).
// since least significant bytes are read first, we need to swizzle...
assign rdata = {rcv_data[7:0],rcv_data[15:8],rcv_data[23:16],rcv_data[31:24]};
always @(posedge clk) begin
if(rstrb) begin
// Start a transaction: select the chip and load command 03h followed by
// the 24-bit byte address (word address with two zero LSBs appended).
CS_N <= 1'b0;
cmd_addr <= {8'h03, 2'b00,word_address[19:0], 2'b00};
snd_bitcount <= 6'd32;
end else begin
if(sending) begin
// When the last bit goes out, arm the receiver for the next cycle.
if(snd_bitcount == 1) begin
rcv_bitcount <= 6'd32;
end
snd_bitcount <= snd_bitcount - 6'd1;
// Shift left; ones are shifted in behind the command/address bits.
cmd_addr <= {cmd_addr[30:0],1'b1};
end
if(receiving) begin
rcv_bitcount <= rcv_bitcount - 6'd1;
rcv_data <= {rcv_data[30:0],MISO};
end
// Nothing left to send or receive: deselect the chip (this also clears rbusy).
if(!busy) begin
CS_N <= 1'b1;
end
end
end
endmodule
`endif
/********************************************************************************************************************************/
`ifdef SPI_FLASH_FAST_READ
// Read-only, memory-mapped SPI flash using the FAST READ command (0Bh).
// Same structure as the plain READ variant, but 40 clocks are spent in the
// send phase: 32 for command + address and 8 more during which the shift
// register only outputs the ones shifted in behind them (dummy clocks).
module MappedSPIFlash(
input wire clk, // system clock
input wire rstrb, // read strobe
input wire [19:0] word_address, // address of the word to be read
output wire [31:0] rdata, // data read
output wire rbusy, // asserted if busy receiving data
// SPI flash pins
output wire CLK, // clock
output reg CS_N, // chip select negated (active low)
output wire MOSI, // master out slave in (data to be sent to flash)
input wire MISO // master in slave out (data received from flash)
);
reg [5:0] snd_bitcount; // clocks left in the send phase (0 = not sending)
reg [31:0] cmd_addr; // command + address shift register, sent MSB first
reg [5:0] rcv_bitcount; // bits still to be received (0 = not receiving)
reg [31:0] rcv_data; // received bits, shifted in from the LSB side
wire sending = (snd_bitcount != 0);
wire receiving = (rcv_bitcount != 0);
wire busy = sending | receiving;
// rbusy follows chip select: it stays high for the whole transaction.
assign rbusy = !CS_N;
assign MOSI = cmd_addr[31];
initial CS_N = 1'b1;
// Flash clock is the inverted system clock, gated by chip select.
assign CLK = !CS_N && !clk;
// since least significant bytes are read first, we need to swizzle...
assign rdata = {rcv_data[7:0],rcv_data[15:8],rcv_data[23:16],rcv_data[31:24]};
always @(posedge clk) begin
if(rstrb) begin
// Start a transaction: command 0Bh followed by the 24-bit byte address
// (word address with two zero LSBs appended).
CS_N <= 1'b0;
cmd_addr <= {8'h0b, 2'b00,word_address[19:0], 2'b00};
snd_bitcount <= 6'd40; // TODO: check dummy clocks
end else begin
if(sending) begin
// When the last send-phase clock elapses, arm the receiver.
if(snd_bitcount == 1) begin
rcv_bitcount <= 6'd32;
end
snd_bitcount <= snd_bitcount - 6'd1;
// Shift left, filling with ones (these are what goes out during the dummy clocks).
cmd_addr <= {cmd_addr[30:0],1'b1};
end
if(receiving) begin
rcv_bitcount <= rcv_bitcount - 6'd1;
rcv_data <= {rcv_data[30:0],MISO};
end
// Nothing left to send or receive: deselect the chip (this also clears rbusy).
if(!busy) begin
CS_N <= 1'b1;
end
end
end
endmodule
`endif
/********************************************************************************************************************************/
`ifdef SPI_FLASH_FAST_READ_DUAL_OUTPUT
// Read-only, memory-mapped SPI flash using the FAST READ DUAL OUTPUT command (3Bh).
// Command, address and dummy clocks are sent on MOSI as in the FAST READ
// variant (40 clocks); the data phase then samples two bits per clock,
// one on MISO and one on MOSI (which is released while receiving): 16 clocks.
module MappedSPIFlash(
input wire clk, // system clock
input wire rstrb, // read strobe
input wire [19:0] word_address, // address of the word to be read
output wire [31:0] rdata, // data read
output wire rbusy, // asserted if busy receiving data
// SPI flash pins
output wire CLK, // clock
output reg CS_N, // chip select negated (active low)
inout wire MOSI, // master out slave in (data to be sent to flash)
input wire MISO // master in slave out (data received from flash)
);
// Tri-state handling of the bidirectional MOSI pin.
wire MOSI_out;
wire MOSI_in;
wire MOSI_oe;
assign MOSI = MOSI_oe ? MOSI_out : 1'bZ;
assign MOSI_in = MOSI;
reg [5:0] snd_bitcount; // clocks left in the send phase (0 = not sending)
reg [31:0] cmd_addr; // command + address shift register, sent MSB first
reg [5:0] rcv_bitcount; // bits still to be received (0 = not receiving)
reg [31:0] rcv_data; // received bits, shifted in two at a time from the LSB side
wire sending = (snd_bitcount != 0);
wire receiving = (rcv_bitcount != 0);
wire busy = sending | receiving;
// rbusy follows chip select: it stays high for the whole transaction.
assign rbusy = !CS_N;
// MOSI is driven except while receiving; it outputs 0 when not sending.
assign MOSI_oe = !receiving;
assign MOSI_out = sending && cmd_addr[31];
initial CS_N = 1'b1;
// Flash clock is the inverted system clock, gated by chip select.
assign CLK = !CS_N && !clk;
// since least significant bytes are read first, we need to swizzle...
assign rdata = {rcv_data[7:0],rcv_data[15:8],rcv_data[23:16],rcv_data[31:24]};
always @(posedge clk) begin
if(rstrb) begin
// Start a transaction: command 3Bh followed by the 24-bit byte address
// (word address with two zero LSBs appended).
CS_N <= 1'b0;
cmd_addr <= {8'h3b, 2'b00,word_address[19:0], 2'b00};
snd_bitcount <= 6'd40; // TODO: check dummy clocks
end else begin
if(sending) begin
// When the last send-phase clock elapses, arm the receiver.
if(snd_bitcount == 1) begin
rcv_bitcount <= 6'd32;
end
snd_bitcount <= snd_bitcount - 6'd1;
cmd_addr <= {cmd_addr[30:0],1'b1};
end
if(receiving) begin
// Two bits per clock: MISO carries the higher bit, MOSI the lower one.
rcv_bitcount <= rcv_bitcount - 6'd2;
rcv_data <= {rcv_data[29:0],MISO,MOSI_in};
end
// Nothing left to send or receive: deselect the chip (this also clears rbusy).
if(!busy) begin
CS_N <= 1'b1;
end
end
end
endmodule
`endif
/********************************************************************************************************************************/
`ifdef SPI_FLASH_FAST_READ_DUAL_IO
// Read-only, memory-mapped SPI flash using the FAST READ DUAL IO command (BBh).
// A single 40-bit shift register moves two bits per clock on the IO[1:0] pins:
// first outwards (command with doubled bits, address, dummy clocks), then
// inwards (16 clocks for the 32-bit word).
module MappedSPIFlash(
input wire clk, // system clock
input wire rstrb, // read strobe
input wire [19:0] word_address, // address to be read
output wire [31:0] rdata, // data read
output wire rbusy, // asserted if busy receiving data
output wire CLK, // clock
output reg CS_N, // chip select negated (active low)
inout wire [1:0] IO // two bidirectional IO pins
);
reg [4:0] clock_cnt; // send/receive clock, 2 bits per clock (dual IO)
reg [39:0] shifter; // used for sending and receiving
reg dir; // 1 if sending, 0 otherwise
wire busy = (clock_cnt != 0);
wire sending = (dir && busy);
wire receiving = (!dir && busy);
// rbusy follows chip select: it stays high for the whole transaction.
assign rbusy = !CS_N;
// The two data pins IO0 (=MOSI) and IO1 (=MISO) used in bidirectional mode.
reg IO_oe = 1'b1;
wire [1:0] IO_out = shifter[39:38];
wire [1:0] IO_in = IO;
assign IO = IO_oe ? IO_out : 2'bZZ;
initial CS_N = 1'b1;
// Flash clock is the inverted system clock, gated by chip select.
assign CLK = !CS_N && !clk;
// since least significant bytes are read first, we need to swizzle...
assign rdata={shifter[7:0],shifter[15:8],shifter[23:16],shifter[31:24]};
// Duplicates the bits (used because when sending command, dual IO is
// not active yet, and I do not want to have a separate shifter for
// the command and for the args...).
function [15:0] bbyyttee;
input [7:0] x;
begin
bbyyttee = {
x[7],x[7],x[6],x[6],x[5],x[5],x[4],x[4],
x[3],x[3],x[2],x[2],x[1],x[1],x[0],x[0]
};
end
endfunction
always @(posedge clk) begin
if(rstrb) begin
// Start a transaction: drive the pins and load the doubled command byte
// followed by the 24-bit byte address (word address with two zero LSBs).
CS_N <= 1'b0;
IO_oe <= 1'b1;
dir <= 1'b1;
shifter <= {bbyyttee(8'hbb), 2'b00, word_address[19:0], 2'b00};
clock_cnt <= 5'd20 + `SPI_FLASH_DUMMY_CLOCKS; // cmd: 8 clocks address: 12 clocks + dummy clocks
end else begin
if(busy) begin
// Shift by two bits: incoming pin values while receiving, ones while sending.
shifter <= {shifter[37:0], (receiving ? IO_in : 2'b11)};
clock_cnt <= clock_cnt - 5'd1;
// End of the send phase: release the pins and switch to receiving.
// This later assignment to clock_cnt overrides the decrement above.
if(dir && clock_cnt == 1) begin
clock_cnt <= 5'd16; // 32 bits, 2 bits per clock
IO_oe <= 1'b0;
dir <= 1'b0;
end
end else begin
// Idle: deselect the chip (this also clears rbusy).
CS_N <= 1'b1;
end
end
end
endmodule
// The block comment below keeps an alternative implementation for reference;
// it is not compiled.
/*
// 04/02/2021 This version optimized by Matthias Koch
module MappedSPIFlash(
input wire clk, // system clock
input wire rstrb, // read strobe
input wire [19:0] word_address, // read address
output wire [31:0] rdata, // data read
output wire rbusy, // asserted if busy receiving data
output wire CLK, // clock
output wire CS_N, // chip select negated (active low)
inout wire [1:0] IO // two bidirectional IO pins
);
reg [6:0] clock_cnt; // send/receive clock, 2 bits per clock (dual IO)
reg [39:0] shifter; // used for sending and receiving
wire busy = ~clock_cnt[6];
assign CS_N = clock_cnt[6];
assign rbusy = busy;
assign CLK = busy & !clk; // CLK needs to be disabled when not active.
// Since least significant bytes are read first, we need to swizzle...
assign rdata={shifter[7:0],shifter[15:8],shifter[23:16],shifter[31:24]};
// The two data pins IO0 (=MOSI) and IO1 (=MISO) used in bidirectional mode.
wire [1:0] IO_out = shifter[39:38];
wire [1:0] IO_in = IO;
assign IO = clock_cnt > 7'd15 ? IO_out : 2'bZZ;
// assign IO = |clock_cnt[5:4] ? IO_out : 2'bZZ; // optimized version of the line above
always @(posedge clk) begin
if(rstrb) begin
shifter <= {16'hCFCF, 2'b00, word_address[19:0], 2'b00}; // 16'hCFCF is 8'hbb with bits doubled
clock_cnt <= 7'd43; // cmd: 8 clocks address: 12 clocks dummy: 8 clocks. data: 16 clocks, 2 bits per clock
end else begin
if(busy) begin
shifter <= {shifter[37:0], IO_in};
clock_cnt <= clock_cnt - 7'd1;
end
end
end
endmodule
*/
`endif

40
RTL/DEVICES/SDCard.v Normal file
View File

@@ -0,0 +1,40 @@
// femtorv32, a minimalistic RISC-V RV32I core
// Bruno Levy, 2020-2021
//
// This file: driver for SDCard (does nearly nothing,
// for now it is just an interface for software bitbanging,
// see FIRMWARE/LIBFEMTORV32/spi_sd.c)
//
// Bit-banging interface to an SD card in SPI mode.
// Software writes the three output pin levels in wdata[2:0] and reads the
// MISO pin back in bit 0 of rdata; no protocol is implemented in hardware.
module SDCard(
    input  wire        clk,   // system clock
    input  wire        rstrb, // read strobe (not used by this driver)
    input  wire        wstrb, // write strobe
    input  wire        sel,   // select (read/write ignored if low)
    input  wire [31:0] wdata, // data to be written
    output wire [31:0] rdata, // read data
    output wire        MOSI,
    input  wire        MISO,
    output wire        CS_N,
    output wire        CLK
);

   // Latched output pin levels, from MSB to LSB: CS_N, CLK, MOSI.
   // Power-up value: chip deselected (CS_N high), CLK and MOSI low.
   reg [2:0] pins = 3'b100;
   assign {CS_N, CLK, MOSI} = pins;

   // Read path: MISO in bit 0 when selected, all zeroes otherwise.
   wire [31:0] miso_word = {31'b0, MISO};
   assign rdata = sel ? miso_word : 32'b0;

   // Write path: latch the three pin levels on a selected write.
   wire do_write = sel && wstrb;
   always @(posedge clk) begin
      if(do_write) pins <= wdata[2:0];
   end

endmodule

156
RTL/DEVICES/SSD1351_1331.v Normal file
View File

@@ -0,0 +1,156 @@
// femtorv32, a minimalistic RISC-V RV32I core
// Bruno Levy, 2020-2021
//
// This file: driver for SSD1351 and SSD1331 OLED display
// Reference: https://www.crystalfontz.com/controllers/SolomonSystech/SSD1351/
//
// TODO: we could use wmask to write directly 16 bits or 32 bits of data
// (we could even have a 'fast clear' option that writes a number
// of zeroes).
// Selecting either display (NRV_IO_SSD1331 or NRV_IO_SSD1351) defines the
// common macro NRV_IO_SSD1351_1331.
`ifdef NRV_IO_SSD1331
`define NRV_IO_SSD1351_1331
`endif
`ifdef NRV_IO_SSD1351
`define NRV_IO_SSD1351_1331
`endif
// Clock divider for the display: divides the system clock by 2^width.
// CLK is the MSB of a free-running counter; CLK_falling_edge is high during
// the cycle in which the counter is all ones, i.e. the cycle that precedes
// the wrap to zero (where CLK goes from 1 to 0).
module SSD1351_clk #(
   parameter width=1
)(
   input  wire clk,             // input system clock
   output wire CLK,             // SSD1351 clock
   output wire CLK_falling_edge // pulses at each falling edge of CLK
);

   // Initialised to zero: an uninitialised counter would stay X forever in
   // simulation (X + 1 is X), leaving CLK and CLK_falling_edge undefined.
   reg [width-1:0] slow_cnt = {width{1'b0}};

   always @(posedge clk) begin
      slow_cnt <= slow_cnt + 1'b1;
   end

   assign CLK = slow_cnt[width-1];

   // All bits set <=> slow_cnt == (1 << width) - 1.
   assign CLK_falling_edge = &slow_cnt;

endmodule
// Write-only driver for the SSD1351 / SSD1331 OLED display (4-wire SPI).
// A write with one of the sel_xxx inputs high either sets the control pins
// (sel_cntl) or loads a shift register with 8 or 16 bits that are then sent
// MSB first on DIN, one bit per falling edge of the divided clock CLK.
module SSD1351(
input wire clk, // system clock
input wire wstrb, // write strobe (use one of sel_xxx to select dest)
input wire sel_cntl, // wdata[0]: !CS; wdata[1]: RST
input wire sel_cmd, // send 8-bits command to display
input wire sel_dat, // send 8-bits data to display
input wire sel_dat16, // send 16-bits data to display
input wire [31:0] wdata, // data to be written
output wire wbusy, // asserted if the driver is busy sending data
// SSD1351 pins
output DIN, // data in
output CLK, // clock
output reg CS, // chip select (active low)
output reg DC, // data (high) / command (low)
output reg RST // reset (active low)
);
// Power-up state: display held in reset and deselected.
initial begin
DC = 1'b0;
RST = 1'b0;
CS = 1'b1;
end
/********* The clock ****************************************************/
// Note: SSD1351 expects the rising edges of the clock in the middle of
// the data bits.
// TODO: try to have a 'waveform' instead, that is shifted (simpler and
// more elegant).
// Page 52 of the doc: 4-wire SPI timing:
// Unclear what 'Clock Cycle Time' (220 ns) means,
// Clock Low Time (20ns) + Clock High Time (20ns) = 40ns
// max freq = 1/(40ns) = 25 MHz
// experimentally, seems to work up to 30 Mhz (but not more)
wire CLK_falling_edge;
// The division ratio (2, 4 or 8) is chosen at elaboration time from `NRV_FREQ.
generate
if(`NRV_FREQ <= 60) begin // Divide by 2-> 30 MHz
SSD1351_clk #(
.width(1)
)slow_clk(
.clk(clk),
.CLK(CLK),
.CLK_falling_edge(CLK_falling_edge)
);
end else if(`NRV_FREQ <= 120) begin // Divide by 4
SSD1351_clk #(
.width(2)
)slow_clk(
.clk(clk),
.CLK(CLK),
.CLK_falling_edge(CLK_falling_edge)
);
end else begin // Divide by 8
SSD1351_clk #(
.width(3)
)slow_clk(
.clk(clk),
.CLK(CLK),
.CLK_falling_edge(CLK_falling_edge)
);
end
endgenerate
// Currently sent bit, 1-based index
// (0000 config. corresponds to idle)
reg[4:0] bitcount = 5'b0000;
reg[15:0] shifter = 0;
wire sending = (bitcount != 0);
assign DIN = shifter[15];
assign wbusy = sending;
/*************************************************************************/
always @(posedge clk) begin
if(wstrb) begin
// The sel_xxx tests below are independent ifs: if several are high at
// once, the assignments of the last matching one win.
if(sel_cntl) begin
CS <= !wdata[0];
RST <= wdata[1];
end
// Command byte: DC low, 8 bits placed in the upper half of the shifter.
if(sel_cmd) begin
RST <= 1'b1;
DC <= 1'b0;
shifter <= {wdata[7:0],8'b0};
bitcount <= 8;
CS <= 1'b1;
end
// Data byte: DC high, 8 bits placed in the upper half of the shifter.
if(sel_dat) begin
RST <= 1'b1;
DC <= 1'b1;
shifter <= {wdata[7:0],8'b0};
bitcount <= 8;
CS <= 1'b1;
end
// 16-bit data word: DC high, all 16 bits of the shifter are sent.
if(sel_dat16) begin
RST <= 1'b1;
DC <= 1'b1;
shifter <= wdata[15:0];
bitcount <= 16;
CS <= 1'b1;
end
end else begin
// Nothing below runs during a cycle where wstrb is high, whichever
// sel_xxx is set.
// detect falling edge of slow_clk
if(CLK_falling_edge) begin
if(sending) begin
if(CS) begin // first tick activates CS (low)
CS <= 1'b0;
end else begin // shift on falling edge
bitcount <= bitcount - 5'd1;
shifter <= {shifter[14:0], 1'b0};
end
end else begin // last tick deactivates CS (high)
CS <= 1'b1;
end
end
end
end
endmodule

View File

@@ -0,0 +1,51 @@
// Taken from: https://www.fpga4fun.com/HDMI.html
// (c) fpga4fun.com & KNJN LLC 2013
// Encodes one 8-bit video channel (or 2 control bits) into a 10-bit TMDS
// symbol per pixel clock. The output is registered: TMDS changes on the
// rising edge of clk.
module TMDS_encoder(
input clk, // Pixel clock (25 MHz for 640x480)
input [7:0] VD, // video data (one of red, green or blue)
input [1:0] CD, // control data
input VDE, // video data enable, to choose between CD (when VDE=0) and VD (when VDE=1)
output reg [9:0] TMDS = 0 // The generated 10-bits signal (scrambled to minimize transitions, and 0/1-balanced)
);
/* verilator lint_off WIDTH */
/* verilator lint_off UNOPTFLAT */
// Number of ones in the input byte.
wire [3:0] Nb1s = VD[0] + VD[1] + VD[2] + VD[3] + VD[4] + VD[5] + VD[6] + VD[7];
// Asserted when the byte has more than four ones, or exactly four with VD[0]==0:
// the chain below then uses XNOR instead of XOR.
wire XNOR = (Nb1s>4'd4) || (Nb1s==4'd4 && VD[0]==1'b0);
// [Bruno Levy Jan 2021]
// Compact writing: wire [8:0] q_m = {~XNOR, q_m[6:0] ^ VD[7:1] ^ {7{XNOR}}, VD[0]};
// ... generates combinatorial loop warning, so I'd rather expand it (less compact,
// less elegant, but I did not like this combinatorial loop warning).
// First stage: each bit is the previous encoded bit XORed (or XNORed) with
// the next data bit; q_m[8] records which of the two was used.
wire [8:0] q_m;
assign q_m[0] = VD[0];
assign q_m[1] = q_m[0] ^ VD[1] ^ XNOR;
assign q_m[2] = q_m[1] ^ VD[2] ^ XNOR;
assign q_m[3] = q_m[2] ^ VD[3] ^ XNOR;
assign q_m[4] = q_m[3] ^ VD[4] ^ XNOR;
assign q_m[5] = q_m[4] ^ VD[5] ^ XNOR;
assign q_m[6] = q_m[5] ^ VD[6] ^ XNOR;
assign q_m[7] = q_m[6] ^ VD[7] ^ XNOR;
assign q_m[8] = ~XNOR;
// Second stage: running balance accumulator (4 bits), used to decide
// whether q_m[7:0] is sent inverted.
reg [3:0] balance_acc = 0;
// Number of ones in q_m[7:0] minus 4 (4-bit two's complement).
wire [3:0] balance = q_m[0] + q_m[1] + q_m[2] + q_m[3] + q_m[4] + q_m[5] + q_m[6] + q_m[7] - 4'd4;
wire balance_sign_eq = (balance[3] == balance_acc[3]);
// If this symbol or the accumulator is balanced, the decision comes from
// q_m[8]; otherwise invert when both have the same sign.
wire invert_q_m = (balance==0 || balance_acc==0) ? ~q_m[8] : balance_sign_eq;
wire [3:0] balance_acc_inc = balance - ({q_m[8] ^ ~balance_sign_eq} & ~(balance==0 || balance_acc==0));
wire [3:0] balance_acc_new = invert_q_m ? balance_acc-balance_acc_inc : balance_acc+balance_acc_inc;
// Data symbol: {inversion flag, XOR/XNOR flag, payload (inverted if flagged)}.
wire [9:0] TMDS_data = {invert_q_m, q_m[8], q_m[7:0] ^ {8{invert_q_m}}};
// Control symbol: one of four fixed codes selected by CD.
wire [9:0] TMDS_code = CD[1] ? (CD[0] ? 10'b1010101011 : 10'b0101010100) : (CD[0] ? 10'b0010101011 : 10'b1101010100);
always @(posedge clk) begin
TMDS <= VDE ? TMDS_data : TMDS_code;
// The balance accumulator is cleared whenever a control symbol is sent.
balance_acc <= VDE ? balance_acc_new : 4'h0;
end
/* verilator lint_on UNOPTFLAT */
/* verilator lint_on WIDTH */
endmodule

View File

@@ -0,0 +1,106 @@
/*
* PicoSoC - A simple example SoC using PicoRV32
*
* Copyright (C) 2017 Clifford Wolf <clifford@clifford.at>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
// 32-bit wide RAM built from the four single-port RAM blocks (SB_SPRAM256KA)
// of the iCE40 UP5K, with one write-enable bit per byte.
// Two blocks side by side form a 32-bit word (low and high half-word);
// addr[14] selects which pair of blocks is accessed.
// Under BENCH_OR_LINT a behavioural model replaces the primitives.
module ice40up5k_spram #(
   // We currently always use the whole SPRAM (128 kB)
   parameter integer WORDS = 32768 // number of 32-bit words
) (
   input         clk,
   input  [3:0]  wen,   // per-byte write enables
   input  [14:0] addr,  // word address
   input  [31:0] wdata,
   output [31:0] rdata
);

   // [BL 04/2021] added simulation model
`ifdef BENCH_OR_LINT

   // One entry per 32-bit word. (The model used to be sized WORDS/4, which
   // covered only the first quarter of the addressable range: accesses
   // above it read X and dropped writes, unlike the hardware.)
   reg [31:0] RAM[WORDS-1:0];
   reg [31:0] rdata_reg;
   assign rdata = rdata_reg;

   // Synchronous byte-wise write and synchronous read.
   always @(posedge clk) begin
      /* verilator lint_off WIDTH */
      if(wen[0]) RAM[addr][ 7:0 ] <= wdata[ 7:0 ];
      if(wen[1]) RAM[addr][15:8 ] <= wdata[15:8 ];
      if(wen[2]) RAM[addr][23:16] <= wdata[23:16];
      if(wen[3]) RAM[addr][31:24] <= wdata[31:24];
      rdata_reg <= RAM[addr];
      /* verilator lint_on WIDTH */
   end

`else

   wire cs_0, cs_1;
   wire [31:0] rdata_0, rdata_1;

   // addr[14] selects the lower (ram00/ram01) or upper (ram10/ram11) pair.
   assign cs_0 = !addr[14];
   assign cs_1 = addr[14];
   assign rdata = addr[14] ? rdata_1 : rdata_0;

   // Each block is 16 bits wide; MASKWREN has one bit per nibble, so each
   // byte write-enable is duplicated.

   SB_SPRAM256KA ram00 (           // lower pair, bits 15:0
      .ADDRESS(addr[13:0]),
      .DATAIN(wdata[15:0]),
      .MASKWREN({wen[1], wen[1], wen[0], wen[0]}),
      .WREN(wen[1]|wen[0]),
      .CHIPSELECT(cs_0),
      .CLOCK(clk),
      .STANDBY(1'b0),
      .SLEEP(1'b0),
      .POWEROFF(1'b1),
      .DATAOUT(rdata_0[15:0])
   );

   SB_SPRAM256KA ram01 (           // lower pair, bits 31:16
      .ADDRESS(addr[13:0]),
      .DATAIN(wdata[31:16]),
      .MASKWREN({wen[3], wen[3], wen[2], wen[2]}),
      .WREN(wen[3]|wen[2]),
      .CHIPSELECT(cs_0),
      .CLOCK(clk),
      .STANDBY(1'b0),
      .SLEEP(1'b0),
      .POWEROFF(1'b1),
      .DATAOUT(rdata_0[31:16])
   );

   SB_SPRAM256KA ram10 (           // upper pair, bits 15:0
      .ADDRESS(addr[13:0]),
      .DATAIN(wdata[15:0]),
      .MASKWREN({wen[1], wen[1], wen[0], wen[0]}),
      .WREN(wen[1]|wen[0]),
      .CHIPSELECT(cs_1),
      .CLOCK(clk),
      .STANDBY(1'b0),
      .SLEEP(1'b0),
      .POWEROFF(1'b1),
      .DATAOUT(rdata_1[15:0])
   );

   SB_SPRAM256KA ram11 (           // upper pair, bits 31:16
      .ADDRESS(addr[13:0]),
      .DATAIN(wdata[31:16]),
      .MASKWREN({wen[3], wen[3], wen[2], wen[2]}),
      .WREN(wen[3]|wen[2]),
      .CHIPSELECT(cs_1),
      .CLOCK(clk),
      .STANDBY(1'b0),
      .SLEEP(1'b0),
      .POWEROFF(1'b1),
      .DATAOUT(rdata_1[31:16])
   );

`endif

endmodule

95
RTL/DEVICES/uart.v Normal file
View File

@@ -0,0 +1,95 @@
// femtorv32, a minimalistic RISC-V RV32I core
//
// Bruno Levy, 2020-2021
//
// This file: driver for UART (serial over USB)
// Wrapper around modified Claire Wolf's UART
`ifdef BENCH
// If BENCH is defined, use a fake UART that displays
// each sent character.
// Simulation-only stand-in for the UART: every byte written to the data
// register is printed on the simulator's standard output, and writing the
// value 4 (EOT) ends the simulation. Nothing is ever received.
module UART(
    input  wire        clk,      // system clock
    input  wire        rstrb,    // read strobe
    input  wire        wstrb,    // write strobe
    input  wire        sel_dat,  // select data reg (rw)
    input  wire        sel_cntl, // select control reg (r)
    input  wire [31:0] wdata,    // data to be written
    output wire [31:0] rdata,    // data read

    input  wire        RXD,      // UART pins (unused in bench mode)
    output wire        TXD,

    output reg         brk       // goes high one cycle when <ctrl><C> is pressed.
);

   // Reads always return zero (no received data, never busy).
   assign rdata = 32'b0;
   assign TXD   = 1'b0;

   // No break can be received in bench mode: hold brk low. It was left
   // undriven before, so it read X in simulation.
   initial brk = 1'b0;

   always @(posedge clk) begin
      if(sel_dat && wstrb) begin
         if(wdata == 32'd4) begin
            $display("<end of simulation> (EOT sent to UART)");
            $finish();
         end
         $write("%c",wdata[7:0]);
         $fflush(32'h8000_0001); // flush stdout so characters appear immediately
      end
   end

endmodule
`else
// For some reasons, our 'compressed' version of
// the UART does not work on the ARTY, there is
// probably a couple of bugs there...
`ifdef ARTY
`include "uart_picosoc.v.orig"
`else
`include "uart_picosoc_shrunk.v"
`endif
// Memory-mapped wrapper around the `buart` serial core (115200 bauds).
//   data register    (sel_dat) : write = send wdata[7:0],
//                                read  = {busy, valid, received byte} and
//                                        acknowledges the received byte
//   control register (sel_cntl): read  = {busy, valid} in bits 9 and 8
// Receiving the byte 3 (<ctrl><C>) raises brk, which also resets the core.
module UART(
    input  wire        clk,      // system clock
    input  wire        rstrb,    // read strobe
    input  wire        wstrb,    // write strobe
    input  wire        sel_dat,  // select data reg (rw)
    input  wire        sel_cntl, // select control reg (r)
    input  wire [31:0] wdata,    // data to be written
    output wire [31:0] rdata,    // data read

    input  wire        RXD,      // UART pins
    output wire        TXD,

    output reg         brk       // goes high one cycle when <ctrl><C> is pressed.
);

   wire [7:0] rx_data;        // last received byte
   wire       serial_tx_busy; // transmitter busy
   wire       serial_valid;   // rx_data holds a byte that was not read yet

   // Defined power-up value, so that the core's reset input (!brk) is not
   // undefined before the first clock edge.
   initial brk = 1'b0;

   buart #(
      .FREQ_MHZ(`NRV_FREQ),
      .BAUDS(115200)
   ) the_buart (
      .clk(clk),
      .resetq(!brk),
      .tx(TXD),
      .rx(RXD),
      .tx_data(wdata[7:0]),
      .rx_data(rx_data),
      .busy(serial_tx_busy),
      .valid(serial_valid),
      .wr(sel_dat && wstrb),
      .rd(sel_dat && rstrb)
   );

   // Both registers expose busy in bit 9 and valid in bit 8.
   assign rdata = sel_dat  ? {22'b0, serial_tx_busy, serial_valid, rx_data}
                : sel_cntl ? {22'b0, serial_tx_busy, serial_valid, 8'b0   }
                : 32'b0;

   // Break detection: byte 3 available in the receive buffer.
   always @(posedge clk) begin
      brk <= serial_valid && (rx_data == 8'd3);
   end

endmodule
`endif

View File

@@ -0,0 +1,131 @@
/*
* PicoSoC - A simple example SoC using PicoRV32
*
* Copyright (C) 2017 Clifford Wolf <clifford@clifford.at>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
// October 2019, Matthias Koch: Renamed wires
// December 2020, Bruno Levy: parameterization with freq and bauds
// Simple 8N1 UART (one start bit, 8 data bits LSB first, one stop bit).
//   wr : load tx_data and start sending (ignored while a frame is being sent)
//   rd : acknowledge the received byte (clears valid)
//   busy  : transmitter busy
//   valid : rx_data holds a received byte
// The bit period is `divider` system clock cycles.
module buart #(
   parameter FREQ_MHZ = 60,
   parameter BAUDS    = 115200
) (
   input        clk,
   input        resetq,  // synchronous reset, active low

   output       tx,
   input        rx,

   input        wr,
   input        rd,
   input  [7:0] tx_data,
   output [7:0] rx_data,

   output       busy,
   output       valid
);

   parameter divider = FREQ_MHZ * 1000000 / BAUDS;

   // The bit timers are compared with "> divider", so they must be able to
   // hold divider+1. $clog2(divider) bits are not enough when divider is a
   // power of two (the comparison would never be true): use one more bit.
   localparam DIVCNT_MSB = $clog2(divider);

   // All registers start in their reset state, so that the UART works even
   // if resetq is never asserted after power-up (in particular, tx idles high).
   reg [3:0]            recv_state     = 0; // 0: idle, 1: start bit, 2..9: data bits, 10: stop bit
   reg [DIVCNT_MSB:0]   recv_divcnt    = 0; // Counts to divider. Reserve enough bits !
   reg [7:0]            recv_pattern   = 0;
   reg [7:0]            recv_buf_data  = 0;
   reg                  recv_buf_valid = 0;

   reg [9:0]            send_pattern   = ~10'b0;
   reg                  send_dummy     = 1;
   reg [3:0]            send_bitcnt    = 0;
   reg [DIVCNT_MSB:0]   send_divcnt    = 0; // Counts to divider. Reserve enough bits !

   assign rx_data = recv_buf_data;
   assign valid   = recv_buf_valid;
   assign busy    = (send_bitcnt || send_dummy);

   /************* Receiver ***********************************/

   always @(posedge clk) begin
      if (!resetq) begin
         recv_state     <= 0;
         recv_divcnt    <= 0;
         recv_pattern   <= 0;
         recv_buf_data  <= 0;
         recv_buf_valid <= 0;
      end else begin
         recv_divcnt <= recv_divcnt + 1;
         if (rd) recv_buf_valid <= 0;
         case (recv_state)
            0: begin
               if (!rx)
                  recv_state <= 1;
               // Keep the timer at zero while idle, so that the half-bit
               // delay of state 1 is measured from the start-bit edge
               // (as in the upstream PicoSoC simpleuart).
               recv_divcnt <= 0;
            end
            1: begin
               // Wait until the middle of the start bit.
               if (recv_divcnt > divider/2) begin
                  recv_state  <= 2;
                  recv_divcnt <= 0;
               end
            end
            10: begin
               // One bit period after the last data bit: publish the byte.
               if (recv_divcnt > divider) begin
                  recv_buf_data  <= recv_pattern;
                  recv_buf_valid <= 1;
                  recv_state     <= 0;
               end
            end
            default: begin
               // States 2..9: sample one data bit per bit period, LSB first.
               if (recv_divcnt > divider) begin
                  recv_pattern <= {rx, recv_pattern[7:1]};
                  recv_state   <= recv_state + 1;
                  recv_divcnt  <= 0;
               end
            end
         endcase
      end
   end

   /************* Transmitter ********************************/

   assign tx = send_pattern[0];

   always @(posedge clk) begin
      send_divcnt <= send_divcnt + 1;
      if (!resetq) begin
         send_pattern <= ~0;
         send_bitcnt  <= 0;
         send_divcnt  <= 0;
         send_dummy   <= 1;
      end else begin
         if (send_dummy && !send_bitcnt) begin
            // After reset: keep the line idle (high) for 15 bit periods.
            send_pattern <= ~0;
            send_bitcnt  <= 15;
            send_divcnt  <= 0;
            send_dummy   <= 0;
         end else if (wr && !send_bitcnt) begin
            // New frame: stop bit, data, start bit (sent from bit 0).
            send_pattern <= {1'b1, tx_data[7:0], 1'b0};
            send_bitcnt  <= 10;
            send_divcnt  <= 0;
         end else if (send_divcnt > divider && send_bitcnt) begin
            // Next bit; ones are shifted in so that the line ends up idle.
            send_pattern <= {1'b1, send_pattern[9:1]};
            send_bitcnt  <= send_bitcnt - 1;
            send_divcnt  <= 0;
         end
      end
   end

endmodule

View File

@@ -0,0 +1,134 @@
/*
* PicoSoC - A simple example SoC using PicoRV32
*
* Copyright (C) 2017 Clifford Wolf <clifford@clifford.at>
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
*/
// October 2019, Matthias Koch: Renamed wires and optimizations.
// December 2020, Bruno Levy: parameterization with freq and bauds
// Factorized recv_divcnt and send_divcnt
// Additional LUT golfing tricks
// Size-optimised 8N1 UART (one start bit, 8 data bits LSB first, one stop bit).
//   wr : load tx_data and start sending
//   rd : acknowledge the received byte (clears valid)
//   busy  : transmitter busy
//   valid : rx_data holds a received byte
// resetq (active low) only clears the valid flag.
module buart #(
   parameter FREQ_MHZ = 12,
   parameter BAUDS    = 115200
) (
   input        clk,
   input        resetq,

   output       tx,
   input        rx,

   input        wr,
   input        rd,
   input  [7:0] tx_data,
   output [7:0] rx_data,

   output       busy,
   output       valid
);

   /************** Baud frequency constants ******************/

   parameter divider = FREQ_MHZ * 1000000 / BAUDS;
   // The timers are [divwidth:0] and their top bit is used as the "expired"
   // flag, so the reload value `divider` must fit in bits [divwidth-1:0].
   // $clog2(divider+1) guarantees this even when divider is a power of two
   // ($clog2(divider) would put the reload value right on the flag bit);
   // for every other value of divider the width is unchanged.
   parameter divwidth = $clog2(divider+1);

   parameter baud_init      = divider;
   parameter half_baud_init = divider/2+1;

   /************* Receiver ***********************************/

   // Trick from Olof Kindgren: use n+1 bit and decrement instead of
   // incrementing, and test the sign bit.

   reg [divwidth:0] recv_divcnt;
   wire recv_baud_clk = recv_divcnt[divwidth];

   reg       recv_state = 1'b0;      // 0: idle, 1: receiving a frame
   reg [8:0] recv_pattern;
   reg [7:0] recv_buf_data;
   reg       recv_buf_valid = 1'b0;

   assign rx_data = recv_buf_data;
   assign valid   = recv_buf_valid;

   always @(posedge clk) begin

      if (rd)      recv_buf_valid <= 0;
      if (!resetq) recv_buf_valid <= 0;

      case (recv_state)

         0: begin
            // Idle: a low level on rx is a start bit. Arm the timer with
            // half a bit period to reach the middle of the start bit.
            if (!rx) begin
               recv_state  <= 1;
               /* verilator lint_off WIDTH */
               recv_divcnt <= half_baud_init;
               /* verilator lint_on WIDTH */
            end
            recv_pattern <= 0;
         end

         1: begin
            if (recv_baud_clk) begin

               // Inverted start bit shifted through the whole register
               // The idea is to use the start bit as marker
               // for "reception complete",
               // but as initialising registers to 10'b1_11111111_1
               // is more costly than using zero,
               // it is done with inverted logic.

               if (recv_pattern[0]) begin
                  // Marker reached bit 0: the 8 data bits are in [8:1].
                  recv_buf_data  <= ~recv_pattern[8:1];
                  recv_buf_valid <= 1;
                  recv_state     <= 0;
               end else begin
                  recv_pattern <= {~rx, recv_pattern[8:1]};
                  /* verilator lint_off WIDTH */
                  recv_divcnt  <= baud_init;
                  /* verilator lint_on WIDTH */
               end
            end else recv_divcnt <= recv_divcnt - 1;
         end

      endcase
   end

   /************* Transmitter ******************************/

   reg [divwidth:0] send_divcnt;
   wire send_baud_clk = send_divcnt[divwidth];

   reg [9:0] send_pattern = 1;
   assign tx   = send_pattern[0];
   assign busy = |send_pattern[9:1];

   // The transmitter shifts until the stop bit is on the wire,
   // and stops shifting then.

   always @(posedge clk) begin
      if (wr) send_pattern <= {1'b1, tx_data[7:0], 1'b0};
      else if (send_baud_clk & busy) send_pattern <= send_pattern >> 1;

      /* verilator lint_off WIDTH */
      if (wr | send_baud_clk) send_divcnt <= baud_init;
      else send_divcnt <= send_divcnt - 1;
      /* verilator lint_on WIDTH */
   end

endmodule

55
RTL/PLL/femtopll.v Normal file
View File

@@ -0,0 +1,55 @@
/*
* The PLL, that generates the internal clock (high freq) from the
* external one (lower freq).
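* Trying to make something that is portable between different boards.
* Boards selected by the include chain below: ICEStick, IceBreaker,
* IceFeather (WIP), IceSugar, ULX3S, ECP5 evaluation board, FOMU, Arty,
* Cmod A7, Tang Nano 9K and Tang Primer 20K.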
*/
`ifdef BENCH_OR_LINT
`define PASSTHROUGH_PLL
`endif
/*
`ifdef TANGNANO9K
`define PASSTHROUGH_PLL
`endif
*/
/**********************************************************************/
`ifdef PASSTHROUGH_PLL
// Pass-through variant: no PLL primitive is instantiated and the output
// clock is the input clock itself. The freq parameter is accepted so that
// instantiations stay identical to the board-specific variants, but it is
// not used here.
module femtoPLL #(
   parameter freq = 60
) (
   input  wire pclk,   // reference clock
   output wire clk     // same signal as pclk
);
   assign clk = pclk;
endmodule
`else
   // Board-specific variant: exactly one file is included, chosen by the
   // board macro that is defined. Each of these files provides its own
   // femtoPLL module. If none of the macros is defined, nothing is
   // included and femtoPLL stays undefined.
   `ifdef ICE_STICK
      `include "pll_icestick.v"
   `elsif ICE_BREAKER
      `include "pll_icebreaker.v"
   `elsif ICE_FEATHER
      `include "pll_icefeather.v"
   `elsif ICE_SUGAR
      `include "pll_icesugar.v"
   `elsif ULX3S
      `include "pll_ulx3s.v"
   `elsif ECP5_EVN
      `include "pll_ecp5_evn.v"
   `elsif FOMU
      `include "pll_fomu.v"
   `elsif ARTY
      `include "pll_arty.v"
   `elsif CMODA7
      `include "pll_cmod_a7.v"
   `elsif TANGNANO9K
      `include "pll_tangnano9k.v"
   `elsif PRIMER20K
      `include "pll_tangprimer20k.v"
   `endif
`endif

35
RTL/PLL/frequencies.txt Normal file
View File

@@ -0,0 +1,35 @@
16
20
24
25
30
35
40
45
48
50
55
60
65
66
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
150
160
170
180
190
200

166
RTL/PLL/gen_pll.sh Executable file
View File

@@ -0,0 +1,166 @@
#!/bin/sh
# Automatically generates a PLL parameterized by output freq
# (instead of cryptic parameters)
# Exactly two arguments are expected: the FPGA family and the input clock
# frequency in MHz. Anything else prints the usage on stderr and aborts.
[ "$#" -eq 2 ] || {
    echo "Usage: $0 FPGA_KIND INPUTFREQ" >&2
    exit 1
}

FPGA_KIND=$1
INPUTFREQ=$2

# Banner written at the top of the generated Verilog file (stdout).
echo "/* "
echo " * Do not edit this file, it was generated by gen_pll.sh"
echo " * "
echo " * FPGA kind : $FPGA_KIND"
echo " * Input frequency: $INPUTFREQ MHz"
echo " */"
case $FPGA_KIND in
"ICE40")
    # iCE40: emit a femtoPLL module wrapping SB_PLL40_CORE. The fixed part
    # of the module comes from the here-documents; the per-frequency
    # divider settings are extracted from the output of icepll.
    cat << EOF
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
SB_PLL40_CORE pll (
.REFERENCECLK(pclk),
.PLLOUTCORE(clk),
.RESETB(1'b1),
.BYPASS(1'b0)
);
defparam pll.FEEDBACK_PATH="SIMPLE";
defparam pll.PLLOUT_SELECT="GENCLK";
generate
case(freq)
EOF
    # One case item per frequency listed in frequencies.txt (read from the
    # current directory).
    for OUTPUTFREQ in $(cat frequencies.txt)
    do
        echo " $OUTPUTFREQ: begin"
        # Keep only the DIVR/DIVF/DIVQ/FILTER_RANGE lines, strip ':' and the
        # parentheses, and print field 1 (name) with field 3 (the Verilog
        # literal). grep -E replaces the obsolescent egrep, which recent GNU
        # grep releases warn about on every invocation.
        icepll -i "$INPUTFREQ" -o "$OUTPUTFREQ" \
            | grep -E "DIVR|DIVF|DIVQ|FILTER_RANGE" \
            | sed -e 's|[:()]||g' \
            | awk '{printf(" defparam pll.%s = %s;\n",$1,$3);}'
        echo " end"
    done
    # The default item instantiates a module named UNKNOWN_FREQUENCY, so an
    # unsupported freq value shows up as an elaboration error naming it.
    cat <<EOF
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule
EOF
    ;;
"ECP5")
    # ECP5: emit a femtoPLL module wrapping EHXPLLL. The fixed part of the
    # module comes from the here-documents; the per-frequency divider
    # settings are extracted from the Verilog file written by ecppll.
    cat << EOF
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
(* ICP_CURRENT="12" *) (* LPF_RESISTOR="8" *) (* MFG_ENABLE_FILTEROPAMP="1" *) (* MFG_GMCREF_SEL="2" *)
EHXPLLL pll_i (
.RST(1'b0),
.STDBY(1'b0),
.CLKI(pclk),
.CLKOP(clk),
.CLKFB(clk),
.CLKINTFB(),
.PHASESEL0(1'b0),
.PHASESEL1(1'b0),
.PHASEDIR(1'b1),
.PHASESTEP(1'b1),
.PHASELOADREG(1'b1),
.PLLWAKESYNC(1'b0),
.ENCLKOP(1'b0)
);
defparam pll_i.PLLRST_ENA = "DISABLED";
defparam pll_i.INTFB_WAKE = "DISABLED";
defparam pll_i.STDBY_ENABLE = "DISABLED";
defparam pll_i.DPHASE_SOURCE = "DISABLED";
defparam pll_i.OUTDIVIDER_MUXA = "DIVA";
defparam pll_i.OUTDIVIDER_MUXB = "DIVB";
defparam pll_i.OUTDIVIDER_MUXC = "DIVC";
defparam pll_i.OUTDIVIDER_MUXD = "DIVD";
defparam pll_i.CLKOP_ENABLE = "ENABLED";
defparam pll_i.CLKOP_FPHASE = 0;
defparam pll_i.FEEDBK_PATH = "CLKOP";
generate
case(freq)
EOF
    # One case item per frequency listed in frequencies.txt (read from the
    # current directory).
    for OUTPUTFREQ in $(cat frequencies.txt)
    do
        echo " $OUTPUTFREQ: begin"
        # ecppll writes its Verilog to tmp.v; its console output is parked
        # in tmp.txt so that it does not end up in the generated file.
        ecppll -i "$INPUTFREQ" -o "$OUTPUTFREQ" -f tmp.v > tmp.txt
        # Turn ".NAME(VALUE)," into "NAME=VALUE" and keep the first field.
        # grep -E replaces the obsolescent egrep, which recent GNU grep
        # releases warn about on every invocation.
        grep -E "CLKI_DIV|CLKOP_DIV|CLKOP_CPHASE|CLKFB_DIV" tmp.v \
            | sed -e 's|[),.]| |g' -e 's|(|=|g' \
            | awk '{printf(" defparam pll_i.%s;\n",$1);}'
        rm -f tmp.v tmp.txt
        echo " end"
    done
    # The default item instantiates a module named UNKNOWN_FREQUENCY, so an
    # unsupported freq value shows up as an elaboration error naming it.
    cat <<EOF
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule
EOF
    ;;
"GOWIN")
    # GOWIN: emit a femtoPLL module wrapping rPLL. The first here-document
    # is unquoted on purpose: $INPUTFREQ is expanded into the FCLKIN
    # defparam. Per-frequency divider settings are extracted from the
    # Verilog file written by gowin_pll.
    cat << EOF
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
rPLL pll_i(
.CLKOUTP(),
.CLKOUTD(),
.CLKOUTD3(),
.RESET(1'b0),
.RESET_P(1'b0),
.CLKFB(1'b0),
.FBDSEL(6'b0),
.IDSEL(6'b0),
.ODSEL(6'b0),
.PSDA(4'b0),
.DUTYDA(4'b0),
.FDLY(4'b0),
.CLKIN(pclk),
.CLKOUT(clk)
);
defparam pll_i.FCLKIN="$INPUTFREQ";
generate
case(freq)
EOF
    # One case item per frequency listed in frequencies.txt (read from the
    # current directory).
    for OUTPUTFREQ in $(cat frequencies.txt)
    do
        echo " $OUTPUTFREQ: begin"
        # gowin_pll writes its Verilog to tmp.v; its console output is
        # parked in tmp.txt so that it does not end up in the generated file.
        gowin_pll -i "$INPUTFREQ" -o "$OUTPUTFREQ" -f tmp.v > tmp.txt
        # Turn ".NAME(VALUE)," into "NAME=VALUE" and keep the first field.
        # grep -E replaces the obsolescent egrep, which recent GNU grep
        # releases warn about on every invocation.
        grep -E "IDIV_SEL|FBDIV_SEL|ODIV_SEL" tmp.v \
            | sed -e 's|[),.]| |g' -e 's|(|=|g' \
            | awk '{printf(" defparam pll_i.%s;\n",$1);}'
        rm -f tmp.v tmp.txt
        echo " end"
    done
    # The default item instantiates a module named UNKNOWN_FREQUENCY, so an
    # unsupported freq value shows up as an elaboration error naming it.
    cat <<EOF
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule
EOF
    ;;
*)
    # Unknown FPGA family. The message goes to stderr, like the usage
    # message: stdout carries the generated Verilog and is normally
    # redirected into a .v file, where the user would not see it.
    echo "FPGA_KIND needs to be one of ICE40,ECP5,GOWIN" >&2
    exit 1
    ;;
esac

21
RTL/PLL/gen_plls.sh Executable file
View File

@@ -0,0 +1,21 @@
#!/bin/sh
# Regenerates the board-specific PLL wrappers by calling gen_pll.sh once per
# board. Boards whose PLL file is not listed here (for instance
# pll_icebreaker.v, pll_arty.v, pll_cmod_a7.v) are not touched by this script.

# Stop at the first failing step instead of silently carrying on.
set -e

# gen_pll.sh and frequencies.txt are looked up relative to the current
# directory, so work from the directory that holds this script.
cd "$(dirname "$0")"

# gen_board LABEL FPGA_KIND INPUT_MHZ OUTFILE
# Prints a progress line and writes the generated module to OUTFILE.
gen_board() {
    echo "Generating PLL for $1"
    ./gen_pll.sh "$2" "$3" > "$4"
}

gen_board "FOMU"                  ICE40 48 pll_fomu.v
gen_board "IceFeather"            ICE40 12 pll_icefeather.v
gen_board "IceStick"              ICE40 12 pll_icestick.v
gen_board "IceSugar"              ICE40 12 pll_icesugar.v
gen_board "ULX3S"                 ECP5  25 pll_ulx3s.v
gen_board "ECP5 evaluation board" ECP5  12 pll_ecp5_evn.v
gen_board "tangnano9k"            GOWIN 27 pll_tangnano9k.v

39
RTL/PLL/pll_arty.v Normal file
View File

@@ -0,0 +1,39 @@
// femtoPLL for the Arty board, built on the PLLE2_ADV primitive.
//   freq : requested output frequency in MHz
//   pclk : reference clock (CLKIN1_PERIOD below declares a 10.0 ns period)
//   clk  : PLL output CLKOUT0 after a BUFG
// The feedback multiplier is freq/5 and the CLKOUT0 divider is fixed at 20.
// freq/5 is an integer division, so values of freq that are not multiples
// of 5 are rounded down to the next multiple.
// NOTE(review): the resulting multiplier and VCO frequency are not checked
// against the limits of the primitive - confirm for the freq values in use.
module femtoPLL #(
   parameter freq = 50
) (
   input  wire pclk,
   output wire clk
);

   wire clk_feedback;   // CLKFBOUT wired straight back to CLKFBIN
   wire clk_internal;   // CLKOUT0 before the global clock buffer

   // Alternative divider choice, left commented out in the original:
   // .CLKFBOUT_MULT(8)
   // .CLKOUT0_DIVIDE(8*100/freq)

   PLLE2_ADV #(
      // Input side
      .CLKIN1_PERIOD("10.0"),        // input clock period, in ns
      .REF_JITTER1("0.0"),           // reference input jitter, in UI
      .DIVCLK_DIVIDE(1),             // master division value
      // Feedback path
      .CLKFBOUT_MULT(freq/5),        // multiplier applied to every CLKOUT
      .CLKFBOUT_PHASE("0.0"),        // CLKFB phase offset, in degrees
      // Output 0
      .CLKOUT0_DIVIDE(20),
      .CLKOUT0_DUTY_CYCLE("0.5"),
      .CLKOUT0_PHASE("0.0"),
      // General
      .BANDWIDTH("OPTIMIZED"),       // one of OPTIMIZED, HIGH, LOW
      .STARTUP_WAIT("FALSE")         // delay DONE until the PLL locks
   ) genclock(
      .CLKIN1(pclk),
      .CLKFBIN(clk_feedback),        // feedback clock input
      .CLKFBOUT(clk_feedback),       // feedback clock output
      .CLKOUT0(clk_internal),
      .PWRDWN(1'b0),
      .RST(1'b0)
   );

   BUFG bufg(
      .I(clk_internal),
      .O(clk)
   );

endmodule

39
RTL/PLL/pll_cmod_a7.v Normal file
View File

@@ -0,0 +1,39 @@
// femtoPLL for the Cmod A7 board, built on the PLLE2_ADV primitive.
//   freq : requested output frequency in MHz
//   pclk : reference clock (CLKIN1_PERIOD below declares a 10.0 ns period)
//   clk  : PLL output CLKOUT0 after a BUFG
// The feedback multiplier is freq/5 and the CLKOUT0 divider is fixed at 20.
// freq/5 is an integer division, so values of freq that are not multiples
// of 5 are rounded down to the next multiple.
// NOTE(review): the settings assume a 10.0 ns (100 MHz) reference clock;
// confirm that this matches the clock actually fed to pclk on the Cmod A7.
module femtoPLL #(
   parameter freq = 50
) (
   input  wire pclk,
   output wire clk
);

   wire clk_feedback;   // CLKFBOUT wired straight back to CLKFBIN
   wire clk_internal;   // CLKOUT0 before the global clock buffer

   // Alternative divider choice, left commented out in the original:
   // .CLKFBOUT_MULT(8)
   // .CLKOUT0_DIVIDE(8*100/freq)

   PLLE2_ADV #(
      // Input side
      .CLKIN1_PERIOD("10.0"),        // input clock period, in ns
      .REF_JITTER1("0.0"),           // reference input jitter, in UI
      .DIVCLK_DIVIDE(1),             // master division value
      // Feedback path
      .CLKFBOUT_MULT(freq/5),        // multiplier applied to every CLKOUT
      .CLKFBOUT_PHASE("0.0"),        // CLKFB phase offset, in degrees
      // Output 0
      .CLKOUT0_DIVIDE(20),
      .CLKOUT0_DUTY_CYCLE("0.5"),
      .CLKOUT0_PHASE("0.0"),
      // General
      .BANDWIDTH("OPTIMIZED"),       // one of OPTIMIZED, HIGH, LOW
      .STARTUP_WAIT("FALSE")         // delay DONE until the PLL locks
   ) genclock(
      .CLKIN1(pclk),
      .CLKFBIN(clk_feedback),        // feedback clock input
      .CLKFBOUT(clk_feedback),       // feedback clock output
      .CLKOUT0(clk_internal),
      .PWRDWN(1'b0),
      .RST(1'b0)
   );

   BUFG bufg(
      .I(clk_internal),
      .O(clk)
   );

endmodule

256
RTL/PLL/pll_ecp5_evn.v Normal file
View File

@@ -0,0 +1,256 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : ECP5
* Input frequency: 12 MHz
*/
// femtoPLL: wraps the ECP5 EHXPLLL primitive and picks its divider settings
// at elaboration time from the freq parameter (requested output frequency
// in MHz).
//   pclk : reference clock (12 MHz according to the header of this file)
//   clk  : CLKOP output, also fed back into CLKFB
// Several requested frequencies share one divider set (24 and 25, 48 and 50,
// 65 and 66), so an entry is presumably the closest setting the generator
// found rather than always an exact match.
// NOTE(review): with FEEDBK_PATH = "CLKOP" the output should be
// F_in * CLKFB_DIV / CLKI_DIV (12 * 4 / 3 = 16 MHz for the first entry);
// confirm against the ECP5 PLL documentation before editing values by hand.
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
(* ICP_CURRENT="12" *) (* LPF_RESISTOR="8" *) (* MFG_ENABLE_FILTEROPAMP="1" *) (* MFG_GMCREF_SEL="2" *)
EHXPLLL pll_i (
.RST(1'b0),
.STDBY(1'b0),
.CLKI(pclk),
.CLKOP(clk),
.CLKFB(clk),
.CLKINTFB(),
.PHASESEL0(1'b0),
.PHASESEL1(1'b0),
.PHASEDIR(1'b1),
.PHASESTEP(1'b1),
.PHASELOADREG(1'b1),
.PLLWAKESYNC(1'b0),
.ENCLKOP(1'b0)
);
// Settings common to every output frequency.
defparam pll_i.PLLRST_ENA = "DISABLED";
defparam pll_i.INTFB_WAKE = "DISABLED";
defparam pll_i.STDBY_ENABLE = "DISABLED";
defparam pll_i.DPHASE_SOURCE = "DISABLED";
defparam pll_i.OUTDIVIDER_MUXA = "DIVA";
defparam pll_i.OUTDIVIDER_MUXB = "DIVB";
defparam pll_i.OUTDIVIDER_MUXC = "DIVC";
defparam pll_i.OUTDIVIDER_MUXD = "DIVD";
defparam pll_i.CLKOP_ENABLE = "ENABLED";
defparam pll_i.CLKOP_FPHASE = 0;
defparam pll_i.FEEDBK_PATH = "CLKOP";
// Per-frequency divider settings; the case label is the value of freq.
generate
case(freq)
16: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=37;
defparam pll_i.CLKOP_CPHASE=18;
defparam pll_i.CLKFB_DIV=4;
end
20: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=30;
defparam pll_i.CLKOP_CPHASE=15;
defparam pll_i.CLKFB_DIV=5;
end
24: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=25;
defparam pll_i.CLKOP_CPHASE=12;
defparam pll_i.CLKFB_DIV=2;
end
25: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=25;
defparam pll_i.CLKOP_CPHASE=12;
defparam pll_i.CLKFB_DIV=2;
end
30: begin
defparam pll_i.CLKI_DIV=2;
defparam pll_i.CLKOP_DIV=20;
defparam pll_i.CLKOP_CPHASE=9;
defparam pll_i.CLKFB_DIV=5;
end
35: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=17;
defparam pll_i.CLKOP_CPHASE=8;
defparam pll_i.CLKFB_DIV=3;
end
40: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=15;
defparam pll_i.CLKOP_CPHASE=7;
defparam pll_i.CLKFB_DIV=10;
end
45: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=14;
defparam pll_i.CLKOP_CPHASE=6;
defparam pll_i.CLKFB_DIV=11;
end
48: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=12;
defparam pll_i.CLKOP_CPHASE=5;
defparam pll_i.CLKFB_DIV=4;
end
50: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=12;
defparam pll_i.CLKOP_CPHASE=5;
defparam pll_i.CLKFB_DIV=4;
end
55: begin
defparam pll_i.CLKI_DIV=2;
defparam pll_i.CLKOP_DIV=11;
defparam pll_i.CLKOP_CPHASE=5;
defparam pll_i.CLKFB_DIV=9;
end
60: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=10;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=5;
end
65: begin
defparam pll_i.CLKI_DIV=2;
defparam pll_i.CLKOP_DIV=9;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=11;
end
66: begin
defparam pll_i.CLKI_DIV=2;
defparam pll_i.CLKOP_DIV=9;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=11;
end
70: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=9;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=17;
end
75: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=8;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=19;
end
80: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=7;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=20;
end
85: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=7;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=7;
end
90: begin
defparam pll_i.CLKI_DIV=2;
defparam pll_i.CLKOP_DIV=7;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=15;
end
95: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=6;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=8;
end
100: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=6;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=25;
end
105: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=6;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=26;
end
110: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=28;
end
115: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=29;
end
120: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=10;
end
125: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=31;
end
130: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=32;
end
135: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=34;
end
140: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=35;
end
150: begin
defparam pll_i.CLKI_DIV=2;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=25;
end
160: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=40;
end
170: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=14;
end
180: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=3;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=15;
end
190: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=3;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=16;
end
200: begin
defparam pll_i.CLKI_DIV=3;
defparam pll_i.CLKOP_DIV=3;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=50;
end
// Any other freq instantiates UNKNOWN_FREQUENCY, which is not defined in
// this file - presumably on purpose, so that elaboration fails (verify).
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

238
RTL/PLL/pll_fomu.v Normal file
View File

@@ -0,0 +1,238 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : ICE40
* Input frequency: 48 MHz
*/
// femtoPLL: wraps the iCE40 SB_PLL40_CORE primitive and picks its divider
// settings at elaboration time from the freq parameter (requested output
// frequency in MHz).
//   pclk : reference clock (48 MHz according to the header of this file)
//   clk  : PLLOUTCORE output of the PLL
// NOTE(review): in SIMPLE feedback mode the output should be
// F_ref * (DIVF+1) / ((DIVR+1) * 2^DIVQ); the first entry gives
// 48 * 64 / (3 * 64) = 16 MHz. Confirm against the iCE40 PLL documentation
// before editing values by hand.
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
SB_PLL40_CORE pll (
.REFERENCECLK(pclk),
.PLLOUTCORE(clk),
.RESETB(1'b1),
.BYPASS(1'b0)
);
// Settings common to every output frequency.
defparam pll.FEEDBACK_PATH="SIMPLE";
defparam pll.PLLOUT_SELECT="GENCLK";
// Per-frequency divider settings; the case label is the value of freq.
generate
case(freq)
16: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b110;
defparam pll.FILTER_RANGE = 3'b001;
end
20: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0100111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
24: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0001111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b100;
end
25: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
30: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0010011;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b100;
end
35: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0100010;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
40: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0100111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
45: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0001110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b100;
end
48: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0001111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b100;
end
50: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
55: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0110110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
60: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0010011;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b100;
end
65: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b1000000;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
66: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0010101;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b100;
end
70: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0100010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
75: begin
defparam pll.DIVR = 4'b0001;
defparam pll.DIVF = 7'b0011000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b010;
end
80: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0100111;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
85: begin
defparam pll.DIVR = 4'b0011;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
90: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0001110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b100;
end
95: begin
defparam pll.DIVR = 4'b0011;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
100: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
105: begin
defparam pll.DIVR = 4'b0001;
defparam pll.DIVF = 7'b0100010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b010;
end
110: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0110110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
115: begin
defparam pll.DIVR = 4'b0011;
defparam pll.DIVF = 7'b1001100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
120: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0010011;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b100;
end
125: begin
defparam pll.DIVR = 4'b0011;
defparam pll.DIVF = 7'b1010010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
130: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b1000000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
135: begin
defparam pll.DIVR = 4'b0011;
defparam pll.DIVF = 7'b0101100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
140: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0100010;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
150: begin
defparam pll.DIVR = 4'b0001;
defparam pll.DIVF = 7'b0011000;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b010;
end
160: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0100111;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
170: begin
defparam pll.DIVR = 4'b0011;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
180: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0001110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b100;
end
190: begin
defparam pll.DIVR = 4'b0011;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
200: begin
defparam pll.DIVR = 4'b0010;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
// Any other freq instantiates UNKNOWN_FREQUENCY, which is not defined in
// this file - presumably on purpose, so that elaboration fails (verify).
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

202
RTL/PLL/pll_icebreaker.v Normal file
View File

@@ -0,0 +1,202 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : ICE40
* Input frequency: 12 MHz
*/
// femtoPLL: wraps the iCE40 SB_PLL40_PAD primitive and picks its divider
// settings at elaboration time from the freq parameter (requested output
// frequency in MHz).
//   pclk : reference clock, connected to PACKAGEPIN (12 MHz according to
//          the header of this file)
//   clk  : PLLOUTCORE output of the PLL
// The table below ends at 140; larger freq values fall into the default item.
// NOTE(review): this file instantiates SB_PLL40_PAD rather than
// SB_PLL40_CORE - confirm how it is regenerated before overwriting it.
// NOTE(review): in SIMPLE feedback mode the output should be
// F_ref * (DIVF+1) / ((DIVR+1) * 2^DIVQ); the first entry gives
// 12 * 85 / 64, about 16 MHz. Confirm against the iCE40 PLL documentation
// before editing values by hand.
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
SB_PLL40_PAD pll (
.PACKAGEPIN(pclk),
.PLLOUTCORE(clk),
.RESETB(1'b1),
.BYPASS(1'b0)
);
// Settings common to every output frequency.
defparam pll.FEEDBACK_PATH="SIMPLE";
defparam pll.PLLOUT_SELECT="GENCLK";
// Per-frequency divider settings; the case label is the value of freq.
generate
case(freq)
16: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010100;
defparam pll.DIVQ = 3'b110;
defparam pll.FILTER_RANGE = 3'b001;
end
20: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
24: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
25: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
30: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
35: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
40: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
45: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
48: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
50: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
55: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
60: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
65: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
66: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
70: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
75: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
80: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
85: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
90: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
95: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
100: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
105: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000101;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
110: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
115: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
120: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
125: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
130: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
135: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
140: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
// Any other freq instantiates UNKNOWN_FREQUENCY, which is not defined in
// this file - presumably on purpose, so that elaboration fails (verify).
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

238
RTL/PLL/pll_icefeather.v Normal file
View File

@@ -0,0 +1,238 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : ICE40
* Input frequency: 12 MHz
*/
// femtoPLL: wraps the iCE40 SB_PLL40_CORE primitive and picks its divider
// settings at elaboration time from the freq parameter (requested output
// frequency in MHz).
//   pclk : reference clock (12 MHz according to the header of this file)
//   clk  : PLLOUTCORE output of the PLL
// NOTE(review): in SIMPLE feedback mode the output should be
// F_ref * (DIVF+1) / ((DIVR+1) * 2^DIVQ); the first entry gives
// 12 * 85 / 64, about 16 MHz. Confirm against the iCE40 PLL documentation
// before editing values by hand.
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
SB_PLL40_CORE pll (
.REFERENCECLK(pclk),
.PLLOUTCORE(clk),
.RESETB(1'b1),
.BYPASS(1'b0)
);
// Settings common to every output frequency.
defparam pll.FEEDBACK_PATH="SIMPLE";
defparam pll.PLLOUT_SELECT="GENCLK";
// Per-frequency divider settings; the case label is the value of freq.
generate
case(freq)
16: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010100;
defparam pll.DIVQ = 3'b110;
defparam pll.FILTER_RANGE = 3'b001;
end
20: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
24: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
25: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
30: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
35: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
40: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
45: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
48: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
50: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
55: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
60: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
65: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
66: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
70: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
75: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
80: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
85: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
90: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
95: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
100: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
105: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000101;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
110: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
115: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
120: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
125: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
130: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
135: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
140: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
150: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
160: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
170: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
180: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
190: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
200: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
// Any other freq instantiates UNKNOWN_FREQUENCY, which is not defined in
// this file - presumably on purpose, so that elaboration fails (verify).
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

238
RTL/PLL/pll_icestick.v Normal file
View File

@@ -0,0 +1,238 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : ICE40
* Input frequency: 12 MHz
*/
// femtoPLL: wraps the iCE40 SB_PLL40_CORE primitive and picks its divider
// settings at elaboration time from the freq parameter (requested output
// frequency in MHz).
//   pclk : reference clock (12 MHz according to the header of this file)
//   clk  : PLLOUTCORE output of the PLL
// NOTE(review): in SIMPLE feedback mode the output should be
// F_ref * (DIVF+1) / ((DIVR+1) * 2^DIVQ); the first entry gives
// 12 * 85 / 64, about 16 MHz. Confirm against the iCE40 PLL documentation
// before editing values by hand.
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
SB_PLL40_CORE pll (
.REFERENCECLK(pclk),
.PLLOUTCORE(clk),
.RESETB(1'b1),
.BYPASS(1'b0)
);
// Settings common to every output frequency.
defparam pll.FEEDBACK_PATH="SIMPLE";
defparam pll.PLLOUT_SELECT="GENCLK";
// Per-frequency divider settings; the case label is the value of freq.
generate
case(freq)
16: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010100;
defparam pll.DIVQ = 3'b110;
defparam pll.FILTER_RANGE = 3'b001;
end
20: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
24: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
25: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
30: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
35: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
40: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
45: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
48: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
50: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
55: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
60: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
65: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
66: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
70: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
75: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
80: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
85: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
90: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
95: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
100: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
105: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000101;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
110: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
115: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
120: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
125: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
130: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
135: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
140: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
150: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
160: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
170: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
180: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
190: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
200: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
// Any other freq instantiates UNKNOWN_FREQUENCY, which is not defined in
// this file - presumably on purpose, so that elaboration fails (verify).
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

238
RTL/PLL/pll_icesugar.v Normal file
View File

@@ -0,0 +1,238 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : ICE40
* Input frequency: 12 MHz
*/
// femtoPLL (iCE40 variant): wraps one SB_PLL40_CORE primitive that is clocked
// from pclk and drives clk. The `freq` parameter selects one of the
// pre-computed divider sets in the generate/case below.
// NOTE(review): per the file header the table was generated for a 12 MHz
// input, so `freq` is presumably the requested output frequency in MHz --
// confirm against gen_pll.sh.
// NOTE(review): with FEEDBACK_PATH="SIMPLE" the output is presumably
// 12 MHz * (DIVF+1) / ((DIVR+1) * 2^DIVQ); e.g. freq=40 selects DIVF=52,
// DIVQ=4, i.e. 12*53/16 = 39.75 MHz, so entries are closest matches rather
// than exact values -- verify against the Lattice iCE40 PLL documentation.
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
// PLL primitive: never held in reset (RESETB tied high), never bypassed.
SB_PLL40_CORE pll (
.REFERENCECLK(pclk),
.PLLOUTCORE(clk),
.RESETB(1'b1),
.BYPASS(1'b0)
);
// Settings shared by every supported frequency.
defparam pll.FEEDBACK_PATH="SIMPLE";
defparam pll.PLLOUT_SELECT="GENCLK";
// Per-frequency divider settings. DIVR and FILTER_RANGE are the same in
// every entry; only DIVF and DIVQ change.
generate
case(freq)
16: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010100;
defparam pll.DIVQ = 3'b110;
defparam pll.FILTER_RANGE = 3'b001;
end
20: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
24: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
25: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
30: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b101;
defparam pll.FILTER_RANGE = 3'b001;
end
35: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
40: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
45: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
48: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
50: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
55: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
60: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
65: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
66: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010111;
defparam pll.DIVQ = 3'b100;
defparam pll.FILTER_RANGE = 3'b001;
end
70: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
75: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
80: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
85: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
90: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
95: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
100: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
105: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000101;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
110: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001000;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
115: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001100;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
120: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1001111;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
125: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010010;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
130: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1010110;
defparam pll.DIVQ = 3'b011;
defparam pll.FILTER_RANGE = 3'b001;
end
135: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
140: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0101110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
150: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110001;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
160: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0110100;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
170: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111000;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
180: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111011;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
190: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b0111110;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
200: begin
defparam pll.DIVR = 4'b0000;
defparam pll.DIVF = 7'b1000010;
defparam pll.DIVQ = 3'b010;
defparam pll.FILTER_RANGE = 3'b001;
end
// Any other value of freq instantiates UNKNOWN_FREQUENCY.
// NOTE(review): presumably an intentionally undefined module, so that an
// unsupported freq fails at elaboration -- confirm no such module exists.
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

211
RTL/PLL/pll_tangnano9k.v Normal file
View File

@@ -0,0 +1,211 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : GOWIN
* Input frequency: 27 MHz
*/
// femtoPLL (Gowin variant): wraps one rPLL primitive that is clocked from
// pclk and drives clk from its CLKOUT pin. The `freq` parameter selects one
// of the pre-computed divider sets in the generate/case below.
// NOTE(review): per the file header the table was generated for a 27 MHz
// input, so `freq` is presumably the requested output frequency in MHz --
// confirm against gen_pll.sh.
// NOTE(review): the entries are consistent with
//   CLKOUT = 27 MHz * (FBDIV_SEL+1) / (IDIV_SEL+1)
// (e.g. freq=40 -> 27*3/2 = 40.5 MHz), with ODIV_SEL presumably only setting
// the VCO frequency (CLKOUT * ODIV_SEL) -- verify against the Gowin clock
// user guide. Some entries are closest matches only (24 and 25 share the
// same settings).
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
// Only CLKIN and CLKOUT are used; the other outputs are left open and the
// reset / dynamic-adjustment inputs are tied to zero.
rPLL pll_i(
.CLKOUTP(),
.CLKOUTD(),
.CLKOUTD3(),
.RESET(1'b0),
.RESET_P(1'b0),
.CLKFB(1'b0),
.FBDSEL(6'b0),
.IDSEL(6'b0),
.ODSEL(6'b0),
.PSDA(4'b0),
.DUTYDA(4'b0),
.FDLY(4'b0),
.CLKIN(pclk),
.CLKOUT(clk)
);
defparam pll_i.FCLKIN="27";
// Per-frequency divider settings.
generate
case(freq)
16: begin
defparam pll_i.IDIV_SEL=4;
defparam pll_i.FBDIV_SEL=2;
defparam pll_i.ODIV_SEL=32;
end
20: begin
defparam pll_i.IDIV_SEL=3;
defparam pll_i.FBDIV_SEL=2;
defparam pll_i.ODIV_SEL=32;
end
24: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=7;
defparam pll_i.ODIV_SEL=32;
end
25: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=7;
defparam pll_i.ODIV_SEL=32;
end
30: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=9;
defparam pll_i.ODIV_SEL=16;
end
35: begin
defparam pll_i.IDIV_SEL=6;
defparam pll_i.FBDIV_SEL=8;
defparam pll_i.ODIV_SEL=16;
end
40: begin
defparam pll_i.IDIV_SEL=1;
defparam pll_i.FBDIV_SEL=2;
defparam pll_i.ODIV_SEL=16;
end
45: begin
defparam pll_i.IDIV_SEL=2;
defparam pll_i.FBDIV_SEL=4;
defparam pll_i.ODIV_SEL=16;
end
48: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=15;
defparam pll_i.ODIV_SEL=16;
end
50: begin
defparam pll_i.IDIV_SEL=6;
defparam pll_i.FBDIV_SEL=12;
defparam pll_i.ODIV_SEL=8;
end
55: begin
defparam pll_i.IDIV_SEL=0;
defparam pll_i.FBDIV_SEL=1;
defparam pll_i.ODIV_SEL=8;
end
60: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=19;
defparam pll_i.ODIV_SEL=8;
end
65: begin
defparam pll_i.IDIV_SEL=4;
defparam pll_i.FBDIV_SEL=11;
defparam pll_i.ODIV_SEL=8;
end
66: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=21;
defparam pll_i.ODIV_SEL=8;
end
70: begin
defparam pll_i.IDIV_SEL=4;
defparam pll_i.FBDIV_SEL=12;
defparam pll_i.ODIV_SEL=8;
end
75: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=24;
defparam pll_i.ODIV_SEL=8;
end
80: begin
defparam pll_i.IDIV_SEL=0;
defparam pll_i.FBDIV_SEL=2;
defparam pll_i.ODIV_SEL=8;
end
85: begin
defparam pll_i.IDIV_SEL=6;
defparam pll_i.FBDIV_SEL=21;
defparam pll_i.ODIV_SEL=8;
end
90: begin
defparam pll_i.IDIV_SEL=2;
defparam pll_i.FBDIV_SEL=9;
defparam pll_i.ODIV_SEL=8;
end
95: begin
defparam pll_i.IDIV_SEL=1;
defparam pll_i.FBDIV_SEL=6;
defparam pll_i.ODIV_SEL=8;
end
100: begin
defparam pll_i.IDIV_SEL=6;
defparam pll_i.FBDIV_SEL=25;
defparam pll_i.ODIV_SEL=4;
end
105: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=34;
defparam pll_i.ODIV_SEL=4;
end
110: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=36;
defparam pll_i.ODIV_SEL=4;
end
115: begin
defparam pll_i.IDIV_SEL=3;
defparam pll_i.FBDIV_SEL=16;
defparam pll_i.ODIV_SEL=4;
end
120: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=39;
defparam pll_i.ODIV_SEL=4;
end
125: begin
defparam pll_i.IDIV_SEL=7;
defparam pll_i.FBDIV_SEL=36;
defparam pll_i.ODIV_SEL=4;
end
130: begin
defparam pll_i.IDIV_SEL=4;
defparam pll_i.FBDIV_SEL=23;
defparam pll_i.ODIV_SEL=4;
end
135: begin
defparam pll_i.IDIV_SEL=0;
defparam pll_i.FBDIV_SEL=4;
defparam pll_i.ODIV_SEL=4;
end
140: begin
defparam pll_i.IDIV_SEL=4;
defparam pll_i.FBDIV_SEL=25;
defparam pll_i.ODIV_SEL=4;
end
150: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=49;
defparam pll_i.ODIV_SEL=4;
end
160: begin
defparam pll_i.IDIV_SEL=8;
defparam pll_i.FBDIV_SEL=52;
defparam pll_i.ODIV_SEL=4;
end
170: begin
defparam pll_i.IDIV_SEL=6;
defparam pll_i.FBDIV_SEL=43;
defparam pll_i.ODIV_SEL=4;
end
180: begin
defparam pll_i.IDIV_SEL=2;
defparam pll_i.FBDIV_SEL=19;
defparam pll_i.ODIV_SEL=4;
end
190: begin
defparam pll_i.IDIV_SEL=0;
defparam pll_i.FBDIV_SEL=6;
defparam pll_i.ODIV_SEL=4;
end
200: begin
defparam pll_i.IDIV_SEL=4;
defparam pll_i.FBDIV_SEL=36;
defparam pll_i.ODIV_SEL=4;
end
// Any other value of freq instantiates UNKNOWN_FREQUENCY.
// NOTE(review): presumably an intentionally undefined module, so that an
// unsupported freq fails at elaboration -- confirm no such module exists.
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

View File

@@ -0,0 +1,33 @@
// femtoPLL for the Tang Primer 20K (GW2A-18): fixed 27 MHz -> 54 MHz PLL.
//
// Ports:
//   pclk : 27 MHz reference clock input
//   clk  : PLL output clock (rPLL CLKOUT)
//
// The `freq` parameter exists for interface compatibility with the other
// femtoPLL variants but is NOT used here: the output is always 54 MHz.
//
// rPLL frequency relations (Gowin clock user guide):
//   CLKOUT = FCLKIN * (FBDIV_SEL + 1) / (IDIV_SEL + 1)
//   VCO    = CLKOUT * ODIV_SEL
// ODIV_SEL therefore does not divide CLKOUT; it only places the VCO. The
// previous setting (FBDIV_SEL = 15) requested CLKOUT = 27 * 16 = 432 MHz
// rather than the intended 54 MHz.
module femtoPLL #(
parameter freq = 54 // Default to 54 MHz (informational only, see above)
) (
input wire pclk,
output wire clk
);
// Tang Primer 20K (GW2A-18) PLL Configuration
// Input: 27 MHz
// Output: 54 MHz
rPLL #(
.FCLKIN("27"),
.DEVICE("GW2A-18"),
.IDIV_SEL(0), // Input divider = 1 (PFD = 27 MHz)
.FBDIV_SEL(1), // Feedback divider = 2 (CLKOUT = 27 * 2 / 1 = 54 MHz)
// NOTE(review): 16 gives VCO = 54 * 16 = 864 MHz, assumed to be inside the
// GW2A-18 VCO range -- confirm against the device datasheet.
.ODIV_SEL(16)
) pll_i (
// Only CLKIN and CLKOUT are used; the other outputs are left open and the
// reset / dynamic-adjustment inputs are tied to zero.
.CLKOUTP(),
.CLKOUTD(),
.CLKOUTD3(),
.RESET(1'b0),
.RESET_P(1'b0),
.CLKFB(1'b0),
.FBDSEL(6'b0),
.IDSEL(6'b0),
.ODSEL(6'b0),
.PSDA(4'b0),
.DUTYDA(4'b0),
.FDLY(4'b0),
.CLKIN(pclk),
.CLKOUT(clk)
);
endmodule

256
RTL/PLL/pll_ulx3s.v Normal file
View File

@@ -0,0 +1,256 @@
/*
* Do not edit this file, it was generated by gen_pll.sh
*
* FPGA kind : ECP5
* Input frequency: 25 MHz
*/
// femtoPLL (ECP5 variant): wraps one EHXPLLL primitive that is clocked from
// pclk and drives clk from CLKOP. CLKOP is also wired back to CLKFB and
// FEEDBK_PATH is "CLKOP". The `freq` parameter selects one of the
// pre-computed divider sets in the generate/case below.
// NOTE(review): per the file header the table was generated for a 25 MHz
// input, so `freq` is presumably the requested output frequency in MHz --
// confirm against gen_pll.sh.
// NOTE(review): the entries are consistent with
//   CLKOP = 25 MHz * CLKFB_DIV / CLKI_DIV
// (e.g. freq=40 -> 25*8/5 = 40 MHz), with CLKOP_DIV presumably setting the
// VCO frequency (CLKOP * CLKOP_DIV) -- verify against the Lattice ECP5 PLL
// documentation. Some entries are closest matches only (24 and 25 share the
// same settings).
module femtoPLL #(
parameter freq = 40
) (
input wire pclk,
output wire clk
);
(* ICP_CURRENT="12" *) (* LPF_RESISTOR="8" *) (* MFG_ENABLE_FILTEROPAMP="1" *) (* MFG_GMCREF_SEL="2" *)
EHXPLLL pll_i (
.RST(1'b0),
.STDBY(1'b0),
.CLKI(pclk),
.CLKOP(clk),
.CLKFB(clk),
.CLKINTFB(),
.PHASESEL0(1'b0),
.PHASESEL1(1'b0),
.PHASEDIR(1'b1),
.PHASESTEP(1'b1),
.PHASELOADREG(1'b1),
.PLLWAKESYNC(1'b0),
.ENCLKOP(1'b0)
);
// Settings shared by every supported frequency.
defparam pll_i.PLLRST_ENA = "DISABLED";
defparam pll_i.INTFB_WAKE = "DISABLED";
defparam pll_i.STDBY_ENABLE = "DISABLED";
defparam pll_i.DPHASE_SOURCE = "DISABLED";
defparam pll_i.OUTDIVIDER_MUXA = "DIVA";
defparam pll_i.OUTDIVIDER_MUXB = "DIVB";
defparam pll_i.OUTDIVIDER_MUXC = "DIVC";
defparam pll_i.OUTDIVIDER_MUXD = "DIVD";
defparam pll_i.CLKOP_ENABLE = "ENABLED";
defparam pll_i.CLKOP_FPHASE = 0;
defparam pll_i.FEEDBK_PATH = "CLKOP";
// Per-frequency divider and phase settings.
generate
case(freq)
16: begin
defparam pll_i.CLKI_DIV=8;
defparam pll_i.CLKOP_DIV=38;
defparam pll_i.CLKOP_CPHASE=18;
defparam pll_i.CLKFB_DIV=5;
end
20: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=30;
defparam pll_i.CLKOP_CPHASE=15;
defparam pll_i.CLKFB_DIV=4;
end
24: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=24;
defparam pll_i.CLKOP_CPHASE=11;
defparam pll_i.CLKFB_DIV=1;
end
25: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=24;
defparam pll_i.CLKOP_CPHASE=11;
defparam pll_i.CLKFB_DIV=1;
end
30: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=20;
defparam pll_i.CLKOP_CPHASE=9;
defparam pll_i.CLKFB_DIV=6;
end
35: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=17;
defparam pll_i.CLKOP_CPHASE=8;
defparam pll_i.CLKFB_DIV=7;
end
40: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=15;
defparam pll_i.CLKOP_CPHASE=7;
defparam pll_i.CLKFB_DIV=8;
end
45: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=13;
defparam pll_i.CLKOP_CPHASE=6;
defparam pll_i.CLKFB_DIV=9;
end
48: begin
defparam pll_i.CLKI_DIV=8;
defparam pll_i.CLKOP_DIV=13;
defparam pll_i.CLKOP_CPHASE=6;
defparam pll_i.CLKFB_DIV=15;
end
50: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=12;
defparam pll_i.CLKOP_CPHASE=5;
defparam pll_i.CLKFB_DIV=2;
end
55: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=11;
defparam pll_i.CLKOP_CPHASE=5;
defparam pll_i.CLKFB_DIV=11;
end
60: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=10;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=12;
end
65: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=9;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=13;
end
66: begin
defparam pll_i.CLKI_DIV=8;
defparam pll_i.CLKOP_DIV=9;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=21;
end
70: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=9;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=14;
end
75: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=8;
defparam pll_i.CLKOP_CPHASE=4;
defparam pll_i.CLKFB_DIV=3;
end
80: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=7;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=16;
end
85: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=7;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=17;
end
90: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=7;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=18;
end
95: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=6;
defparam pll_i.CLKOP_CPHASE=3;
defparam pll_i.CLKFB_DIV=19;
end
100: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=6;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=4;
end
105: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=6;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=21;
end
110: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=22;
end
115: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=23;
end
120: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=24;
end
125: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=5;
end
130: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=5;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=26;
end
135: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=27;
end
140: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=28;
end
150: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=6;
end
160: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=2;
defparam pll_i.CLKFB_DIV=32;
end
170: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=4;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=34;
end
180: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=3;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=36;
end
190: begin
defparam pll_i.CLKI_DIV=5;
defparam pll_i.CLKOP_DIV=3;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=38;
end
200: begin
defparam pll_i.CLKI_DIV=1;
defparam pll_i.CLKOP_DIV=3;
defparam pll_i.CLKOP_CPHASE=1;
defparam pll_i.CLKFB_DIV=8;
end
// Any other value of freq instantiates UNKNOWN_FREQUENCY.
// NOTE(review): presumably an intentionally undefined module, so that an
// unsupported freq fails at elaboration -- confirm no such module exists.
default: UNKNOWN_FREQUENCY unknown_frequency();
endcase
endgenerate
endmodule

16
RTL/PROCESSOR/README.md Normal file
View File

@@ -0,0 +1,16 @@
# FemtoRV processor collection
FemtoRV is a collection of small and understandable RISC-V processors.
See this table to choose the most suitable one for your project!
File name | ISA | Special capabilities
------------------------- | -------------- | --------
femtorv32_quark.v | RV32I | The smallest core in this collection, perfect for tiny FPGAs. For size reasons, it shifts only one bit per clock cycle.
femtorv32_quark_bicycle.v | RV32I | The simplest and fastest - in terms of cycles/instruction - core in this collection. Basically Quark with a barrel shifter and additional multiplexers. Recommended if you can afford a few more LUTs and just need a vanilla RV32I.
femtorv32_tachyon.v | RV32I | Quark with execute cycle split in two in order to achieve a higher maximum clock frequency, but at the expense of more cycles per instruction.
femtorv32_electron.v | RV32IM | Featuring barrel shifter, multiplication and division instructions.
femtorv32_intermissum.v | RV32IM + IRQ | Full interrupt support along with CSR registers.
femtorv32_gracilis.v | RV32IMC + IRQ | With compressed instructions support, saves both RAM usage and memory fetch cycles. Recommended as general-purpose processor.
femtorv32_individua.v | RV32IMAC + IRQ | Also available with atomic instructions support. Not really necessary in single processor designs, but probably useful if you have tricky interrupt handlers.
femtorv32_petitbateau.v | RV32IMFC + IRQ | Floating point!

View File

@@ -0,0 +1,7 @@
This directory contains several versions of femtorv32 that I use
for testing different features and their influence on timings:
- testdrive_RV32IM: tachyon core (with two execute cycles) with M extension
- testdrive_RV32IM_simF: M extension, F decoder and simulated FPU (works only with Verilator)
- testdrive_RV32IMF: M and F extensions
I recommend using the other cores instead.

View File

@@ -0,0 +1,479 @@
/******************************************************************************/
// Electron: valid. fmax: 70 MHz exp. fmax: 80 MHz
// TestDrive: morphing tachyon into a RV32IMF core, trying to
// preserve maxfreq at each step.
// Step 0: Tachyon valid. fmax: 115-120 MHz exp. fmax: 135-140 MHz
// Step 1: Barrel shft valid. fmax: 110-115 MHz exp. fmax: 130-135 MHz
// Step 2: RV32M valid. fmax: 105-115 MHz exp. fmax: 120 MHz
//
/******************************************************************************/
// Firmware generation flags for this processor
`define NRV_ARCH "rv32im"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-O3"
// FemtoRV32 "testdrive" RV32IM core.
//
// Multi-cycle core with a one-hot state machine:
//   FETCH_INSTR -> WAIT_INSTR -> EXECUTE1 -> EXECUTE2 [-> WAIT_ALU_OR_MEM]
// The instruction is decoded into registered flags when it arrives from
// memory, EXECUTE1 registers the branch/load-store addresses, the branch
// predicate and (through aluOut) the ALU result, and EXECUTE2 updates PC and
// starts memory accesses. WAIT_ALU_OR_MEM is entered for loads, stores and
// while a division is in progress.
//
// Parameters:
//   RESET_ADDR : value loaded into PC while reset is low
//   ADDR_WIDTH : number of address bits actually computed; the upper bits of
//                mem_addr are zero-padded
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input reset // set to 0 to reset the processor
);
parameter RESET_ADDR = 32'h00000000;
parameter ADDR_WIDTH = 24;
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
// Flip a 32 bit word. Used by the shifter (a single shifter for
// left and right shifts, saves silicon !)
function [31:0] flip32;
input [31:0] x;
flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7],
x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15],
x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
endfunction
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// The destination register
wire [4:0] rdId = instr[11:7];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *) reg [7:0] funct3Is;
// Base RISC-V (RV32I) has only 10 different instructions !
reg isLoad;
reg isALUimm;
reg isAUIPC;
reg isStore;
reg isALUreg;
reg isLUI;
reg isBranch;
reg isJALR;
reg isJAL;
reg isSYSTEM;
reg [31:0] Uimm;
reg [31:0] Iimm;
reg [31:0] Simm;
reg [31:0] Bimm;
reg [31:0] Jimm;
// Decode directly from mem_rdata at the moment the instruction is latched
// (same condition as the `instr <= mem_rdata[31:2]` assignment in the state
// machine), so the decoded flags and immediates are registered.
always @(posedge clk) begin
if(state[WAIT_INSTR_bit] & !mem_rbusy) begin
isLoad <= (mem_rdata[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
isALUimm <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
isAUIPC <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
isStore <= (mem_rdata[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
isALUreg <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
isLUI <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
isBranch <= (mem_rdata[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
isJALR <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
isJAL <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
isSYSTEM <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles
funct3Is <= 8'b00000001 << mem_rdata[14:12];
Uimm <= { mem_rdata[31], mem_rdata[30:12], {12{1'b0}}};
Iimm <= {{21{mem_rdata[31]}}, mem_rdata[30:20]};
Simm <= {{21{mem_rdata[31]}}, mem_rdata[30:25],mem_rdata[11:7]};
Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0};
Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0};
end
end
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] registerFile [31:0];
// Writes to register 0 are discarded, so x0 keeps whatever value the
// register file memory holds for it (no explicit initialisation here).
always @(posedge clk) begin
if (writeBack)
if (rdId != 0)
registerFile[rdId] <= writeBackData;
end
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except shifts.
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
wire aluWr; // ALU write strobe
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire LTU = aluMinus[32];
wire EQ = (aluMinus[31:0] == 0);
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
/* verilator lint_off WIDTH */
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] leftshift = flip32(shifter);
/***************************************************************************/
// funct3: 1->MULH, 2->MULHSU 3->MULHU
// Operand signedness as defined by the RISC-V M extension:
//   MULH   : signed(rs1)   x signed(rs2)
//   MULHSU : signed(rs1)   x unsigned(rs2)
//   MULHU  : unsigned(rs1) x unsigned(rs2)
// so rs1 (aluIn1) is sign-extended for MULH and MULHSU, and rs2 (aluIn2)
// only for MULH. Each operand is widened to 33 bits with its effective sign
// bit so that a single signed multiplier covers all three cases.
wire isMULH = funct3Is[1];
wire isMULHSU = funct3Is[2];
wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
wire sign2 = aluIn2[31] & isMULH;
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
/***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
wire [31:0] alu_base =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
(funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
// funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
// 4->DIV 5->DIVU 6->REM 7->REMU
wire [31:0] alu_mul = funct3Is[0] ? multiply[31: 0] // 0:MUL
: multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
// instr[13] selects remainder (REM/REMU) over quotient (DIV/DIVU); div_sign
// requests negation of the unsigned result.
wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend)
: (div_sign ? -quotient : quotient);
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
// The ALU result is registered on every clock edge, so aluOut always lags
// the combinatorial ALU value by one cycle.
reg [31:0] aluOut;
wire funcM = instr[25];
wire isDivide = instr[14];
always @(posedge clk) begin
aluOut <= (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
end
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// On aluWr the operands are loaded as magnitudes (negated when signed and
// negative) and, for a divide instruction, bit 32 of quotient_msk is set.
// On every other cycle divisor and quotient_msk shift right by one and one
// quotient bit is produced, unconditionally: quotient and dividend keep
// changing after quotient_msk has emptied.
// NOTE(review): quotient_msk is 33 bits wide, presumably so that aluBusy
// stays high one cycle longer than the 32 iterations and the registered
// aluOut has captured the final value when it is written back -- confirm
// before changing its width.
reg div_sign;
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [32:0] quotient_msk;
always @(posedge clk) begin
if (aluWr) begin
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
quotient_msk[32] <= isALUreg & funcM & isDivide;
div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] ^ aluIn2[31]) & |aluIn2);
end else begin
divisor <= divisor >> 1;
quotient_msk <= quotient_msk >> 1;
if(divisor <= {31'b0, dividend}) begin
quotient <= {quotient[30:0],1'b1};
dividend <= dividend - divisor[31:0];
end else begin
quotient <= {quotient[30:0],1'b0};
end
end
end
/***************************************************************************/
// The predicate for conditional branches.
/***************************************************************************/
wire predicate_ =
funct3Is[0] & EQ | // BEQ
funct3Is[1] & !EQ | // BNE
funct3Is[4] & LT | // BLT
funct3Is[5] & !LT | // BGE
funct3Is[6] & LTU | // BLTU
funct3Is[7] & !LTU ; // BGEU
// Registered copy of predicate_, latched in EXECUTE1 and used in EXECUTE2.
reg predicate;
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
reg [ADDR_WIDTH-1:0] PCplusImm;
// A separate adder to compute the destination of load/store.
reg [ADDR_WIDTH-1:0] loadstore_addr;
// The address bus carries PC while fetching and the load/store address in
// every other state.
assign mem_addr = {ADDR_PAD,
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
PC : loadstore_addr
};
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
// OR of per-instruction-class terms; the decoded is* flags come from
// distinct opcode comparisons, so at most one term is non-zero.
wire [31:0] writeBackData =
/* verilator lint_off WIDTH */
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
/* verilator lint_on WIDTH */
(isLUI ? Uimm : 32'b0) | // LUI
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
(isJALR | isJAL ? {ADDR_PAD,PCplus4 } : 32'b0) | // JAL, JALR
(isLoad ? LOAD_data : 32'b0); // Load
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - mem_addr[1:0]: indicates which byte/halfword is accessed
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
// rs2 bytes are replicated onto the byte lanes selected by
// loadstore_addr[1:0]; STORE_wmask below picks the lanes actually written.
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
// The memory write mask:
// 1111 if writing a word
// 0011 or 1100 if writing a halfword
// (depending on loadstore_addr[1])
// 0001, 0010, 0100 or 1000 if writing a byte
// (depending on loadstore_addr[1:0])
wire [3:0] STORE_wmask =
mem_byteAccess ?
(loadstore_addr[1] ?
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
) :
mem_halfwordAccess ?
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
4'b1111;
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam EXECUTE1_bit = 2;
localparam EXECUTE2_bit = 3;
localparam WAIT_ALU_OR_MEM_bit = 4;
localparam NB_STATES = 5;
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE1 = 1 << EXECUTE1_bit;
localparam EXECUTE2 = 1 << EXECUTE2_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable.
// Asserted during EXECUTE2 and on every cycle spent in WAIT_ALU_OR_MEM, so
// the destination register may be written several times; the value written
// on the last cycle is the one that remains.
wire writeBack = ~(isBranch | isStore ) &
(state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]);
// The memory-read signal.
assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write.
assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (divisions) in the ALU.
assign aluWr = state[EXECUTE1_bit] & isALU;
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
`ifdef NRV_IS_IO_ADDR
wire needToWait = isLoad |
isStore & `NRV_IS_IO_ADDR(mem_addr) |
aluBusy;
`else
wire needToWait = isLoad | isStore | aluBusy;
`endif
always @(posedge clk) begin
if(!reset) begin
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
end else
// See note [1] at the end of this file.
(* parallel_case *)
case(1'b1)
state[WAIT_INSTR_bit]: begin
if(!mem_rbusy) begin // may be high when executing from SPI flash
rs1 <= registerFile[mem_rdata[19:15]];
rs2 <= registerFile[mem_rdata[24:20]];
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE1; // also the declaration of instr).
end
end
state[EXECUTE1_bit]: begin
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to:
// PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// testing instr[5] is equivalent to testing isStore in this context.
loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
predicate <= predicate_;
state <= EXECUTE2;
end
state[EXECUTE2_bit]: begin
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
state[WAIT_ALU_OR_MEM_bit]: begin
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
/***************************************************************************/
// Cycle counter
/***************************************************************************/
// Free-running counter, incremented every clock and never reset; this is
// the value written back for SYSTEM instructions.
`ifdef NRV_COUNTER_WIDTH
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`else
reg [31:0] cycles;
`endif
always @(posedge clk) cycles <= cycles + 1;
endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements,
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,689 @@
/******************************************************************************/
// Electron: valid. fmax: 70 MHz exp. fmax: 80 MHz
// TestDrive: morphing tachyon into a RV32IMF core, trying to
// preserve maxfreq at each step.
// Step 0: Tachyon valid. fmax: 115-120 MHz exp. fmax: 135-140 MHz
// Step 1: Barrel shft valid. fmax: 110-115 MHz exp. fmax: 130-135 MHz
// Step 2: RV32M valid. fmax: 105-115 MHz exp. fmax: 120 MHz
// Step 3: RV32F decod only valid. fmax: 100-105 MHz exp. fmax: 105 MHz
//
/******************************************************************************/
// Firmware generation flags for this processor
`define NRV_ARCH "rv32imaf"
`define NRV_ABI "ilp32f"
//`define NRV_ARCH "rv32im"
//`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-O3"
// Check condition and display message in simulation.
// Both macros are active only when BENCH is defined; otherwise they
// expand to nothing.
// msg is pasted verbatim right after $display, so the caller must pass it
// with its own parentheses, e.g. ASSERT(x == 0, ("x should be zero")).
// Neither macro supplies a trailing semicolon.
`ifdef BENCH
`define ASSERT(cond,msg) if(!(cond)) $display msg
`define ASSERT_NOT_REACHED(msg) $display msg
`else
`define ASSERT(cond,msg)
`define ASSERT_NOT_REACHED(msg)
`endif
// FPU Normalization needs to detect the position of the first bit set
// in the A_frac register. It is easier to count the number of leading
// zeroes (CLZ for Count Leading Zeroes), as follows. See:
// https://electronics.stackexchange.com/questions/196914/verilog-synthesize-high-speed-leading-zero-count
// Recursive, purely combinatorial leading-zero counter.
//   in  : W_IN-bit word to examine
//   out : number of zero bits above the most significant set bit of in
// At each level the word is split in two halves. If the upper half is all
// zeroes, the MSB of the count is 1 and the search continues in the lower
// half; otherwise the MSB is 0 and the search continues in the upper half.
// Note: out is only W_OUT = log2(W_IN) bits wide, so an all-zero input
// yields W_IN-1 (same as an input with only bit 0 set), not W_IN.
module CLZ #(
parameter W_IN = 64, // must be power of 2, >= 2
parameter W_OUT = $clog2(W_IN)
) (
input wire [W_IN-1:0] in,
output wire [W_OUT-1:0] out
);
generate
if(W_IN == 2) begin
// Base case: one leading zero iff the upper bit is clear.
assign out = !in[1];
end else begin
wire [W_OUT-2:0] half_count; // count within the selected half
wire [W_IN/2-1:0] lhs = in[W_IN/2 +: W_IN/2]; // upper half
wire [W_IN/2-1:0] rhs = in[0 +: W_IN/2]; // lower half
wire left_empty = ~|lhs; // 1 if the upper half has no bit set
CLZ #(
.W_IN(W_IN/2)
) inner(
.in(left_empty ? rhs : lhs),
.out(half_count)
);
// left_empty contributes W_IN/2 to the count (MSB of out).
assign out = {left_empty, half_count};
end
endgenerate
endmodule
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input reset // set to 0 to reset the processor
);
parameter RESET_ADDR = 32'h00000000;
parameter ADDR_WIDTH = 24;
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
// Reverse the bit order of a 32-bit word (bit 0 <-> bit 31, and so on).
// Used by the shifter: the operand of a left shift is flipped, shifted
// right, and the result flipped back, so a single right shifter serves
// both left and right shifts (saves silicon!).
function [31:0] flip32;
input [31:0] x;
flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7],
x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15],
x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
endfunction
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *) reg [7:0] funct3Is;
// Instruction decoder and immediate decoder
// Base RISC-V (RV32I) has only 10 different instructions !
reg isLoad, isALUimm, isAUIPC, isStore, isALUreg, isLUI,
isBranch, isJALR, isJAL, isSYSTEM, isFPU;
reg [31:0] Uimm, Iimm, Simm, Bimm, Jimm;
reg rdIsNZ; // Asserted if dest. register is non-zero (writeback)
// Registered instruction decoder.
// While the state machine is in WAIT_INSTR, the opcode class flags, the
// one-hot funct3, the five immediates and rdIsNZ are captured from
// mem_rdata on every clock edge. The capture is not gated by mem_rbusy:
// the state machine only leaves WAIT_INSTR when mem_rbusy is low, so the
// values kept are the ones captured on that last WAIT_INSTR cycle.
always @(posedge clk) begin
if(state[WAIT_INSTR_bit]) begin
isLoad <= (mem_rdata[6:3] == 4'b0000); // rd <- mem[rs1+Iimm] (bit 2 not tested: also matches FLW)
isALUimm <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
isAUIPC <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
isStore <= (mem_rdata[6:3] == 4'b0100); // mem[rs1+Simm] <- rs2 (bit 2 not tested: also matches FSW)
isALUreg <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
isLUI <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
isBranch <= (mem_rdata[6:2] == 5'b11000); // if(rs1OPrs2) PC<-PC+Bimm
isJALR <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
isJAL <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
isSYSTEM <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles
isFPU <= (mem_rdata[6:5] == 2'b10); // all FPU except FLW/FSW
// One-hot funct3: funct3Is[i] is set iff funct3 == i.
funct3Is <= 8'b00000001 << mem_rdata[14:12];
// The five immediate formats (U, I, S, B, J), sign-extended to 32 bits.
Uimm <= { mem_rdata[31], mem_rdata[30:12], {12{1'b0}}};
Iimm <= {{21{mem_rdata[31]}}, mem_rdata[30:20]};
Simm <= {{21{mem_rdata[31]}}, mem_rdata[30:25],mem_rdata[11:7]};
Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0};
Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0};
// Destination register index is not zero.
rdIsNZ <= |mem_rdata[11:7];
end
end
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] rs3; // this one is used by the FMA instructions.
reg [31:0] registerFile [0:63]; // 0..31: integer registers
// 32..63: floating-point registers
/***************************************************************************/
// The FPU
/***************************************************************************/
// instruction decoder
reg isFMADD, isFMSUB, isFNMSUB, isFNMADD, isFADD, isFSUB, isFMUL, isFDIV,
isFSQRT, isFSGNJ, isFSGNJN, isFSGNJX, isFMIN, isFMAX, isFEQ, isFLT,
isFLE, isFCLASS, isFCVTWS, isFCVTWUS, isFCVTSW, isFCVTSWU, isFMVXW,
isFMVWX;
reg rdIsFP; // Asserted if destination register is a FP register.
// rs1 is a FP register if instr[6:5] = 2'b10 except for:
// FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[31:28] = 4'b1101
// FMV.W.X : instr[6:2] = 5'b10100 and instr[31:28] = 4'b1111
// Decoded combinatorially from mem_rdata (not from the latched instr)
// because it is used to index the register file while the instruction
// is being latched. Only this regular-instruction version is declared in
// this module (there is no compressed-instruction variant here).
wire rs1IsFP = (mem_rdata[6:5] == 2'b10 ) &&
!((mem_rdata[4:2] == 3'b100) && (
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
)
);
// rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
// (instr[6:2] = 5'b01001). Like rs1IsFP, it is decoded combinatorially
// from mem_rdata, and only this regular-instruction version is declared
// in this module.
wire rs2IsFP = (mem_rdata[6:5] == 2'b10) || (mem_rdata[6:2]==5'b01001);
// Registered FPU instruction decoder, captured from mem_rdata while the
// state machine is in WAIT_INSTR (same timing as the main decoder).
// The isFxxx flags test only the distinguishing bits (instr[4:2] for the
// fused multiply-add group, instr[4] plus instr[31:27] and, where needed,
// funct3 or rs2 bits for the others). They do not test the opcode class,
// so they are only meaningful when isFPU is set.
always @(posedge clk) begin
if(state[WAIT_INSTR_bit]) begin
// Fused multiply-add group, selected by instr[4:2].
isFMADD <= (mem_rdata[4:2] == 3'b000);
isFMSUB <= (mem_rdata[4:2] == 3'b001);
isFNMSUB <= (mem_rdata[4:2] == 3'b010);
isFNMADD <= (mem_rdata[4:2] == 3'b011);
// Other FPU operations: instr[4] set, selected by instr[31:27].
isFADD <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00000));
isFSUB <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00001));
isFMUL <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00010));
isFDIV <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00011));
isFSQRT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b01011));
// Same instr[31:27], discriminated by funct3 bits.
isFSGNJ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b00));
isFSGNJN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b01));
isFSGNJX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b10));
isFMIN <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && !mem_rdata[12]);
isFMAX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && mem_rdata[12]);
isFEQ <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b10));
isFLT <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b01));
isFLE <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b00));
isFCLASS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && mem_rdata[12]);
// Conversions: instr[20] (LSB of the rs2 field) selects the unsigned variant.
isFCVTWS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && !mem_rdata[20]);
isFCVTWUS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && mem_rdata[20]);
isFCVTSW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && !mem_rdata[20]);
isFCVTSWU <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && mem_rdata[20]);
isFMVXW <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && !mem_rdata[12]);
isFMVWX <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11110));
// Destination is a FP register (selects the upper half of registerFile
// at write-back).
rdIsFP <= (mem_rdata[6:2] == 5'b00001) || // FLW
(mem_rdata[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB
(mem_rdata[6:4] == 3'b101 && (
(mem_rdata[31] == 1'b0) || // R-Type FPU
(mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
(mem_rdata[31:28] == 4'b1111) // FMV.W.X
)
);
end
end
reg [31:0] fpuOut;
`define FPU_OUT fpuOut
wire fpuBusy = 0;
// Register file access and (simulation-only) FPU evaluation.
// Exactly one of three actions is taken per clock edge, in priority order:
//  1. WAIT_INSTR: read rs1, rs2 and rs3 from the register file. The MSB of
//     the 6-bit index selects the FP registers (32..63); rs3 is always
//     read from the FP registers, indexed by instr[31:27].
//  2. EXECUTE2 of an FPU instruction: compute fpuOut. This is implemented
//     only under VERILATOR, through $c32 calls; otherwise the branch is
//     empty and fpuOut is left unchanged.
//  3. Register write-back, in EXECUTE2 or WAIT_ALU_OR_MEM, for every
//     instruction except branches and stores, when the destination is a FP
//     register or a non-zero integer register. Since action 2 takes the
//     EXECUTE2 slot of FPU instructions, their result is written back
//     during WAIT_ALU_OR_MEM.
always @(posedge clk) begin
if(state[WAIT_INSTR_bit]) begin
// Fetch registers as soon as instruction is ready.
rs1 <= registerFile[{rs1IsFP,mem_rdata[19:15]}];
rs2 <= registerFile[{rs2IsFP,mem_rdata[24:20]}];
rs3 <= registerFile[{1'b1, mem_rdata[31:27]}];
end else if(state[EXECUTE2_bit] & isFPU) begin
`ifdef VERILATOR
(* parallel_case *)
case(1'b1)
isFMADD : `FPU_OUT <= $c32("FMADD(",rs1,",",rs2,",",rs3,")");
isFMSUB : `FPU_OUT <= $c32("FMSUB(",rs1,",",rs2,",",rs3,")");
isFNMSUB : `FPU_OUT <= $c32("FNMSUB(",rs1,",",rs2,",",rs3,")");
isFNMADD : `FPU_OUT <= $c32("FNMADD(",rs1,",",rs2,",",rs3,")");
isFMUL : `FPU_OUT <= $c32("FMUL(",rs1,",",rs2,")");
isFADD : `FPU_OUT <= $c32("FADD(",rs1,",",rs2,")");
isFSUB : `FPU_OUT <= $c32("FSUB(",rs1,",",rs2,")");
isFDIV : `FPU_OUT <= $c32("FDIV(",rs1,",",rs2,")");
isFSQRT : `FPU_OUT <= $c32("FSQRT(",rs1,")");
isFSGNJ : `FPU_OUT <= $c32("FSGNJ(",rs1,",",rs2,")");
isFSGNJN : `FPU_OUT <= $c32("FSGNJN(",rs1,",",rs2,")");
isFSGNJX : `FPU_OUT <= $c32("FSGNJX(",rs1,",",rs2,")");
isFMIN : `FPU_OUT <= $c32("FMIN(",rs1,",",rs2,")");
isFMAX : `FPU_OUT <= $c32("FMAX(",rs1,",",rs2,")");
isFEQ : `FPU_OUT <= $c32("FEQ(",rs1,",",rs2,")");
isFLE : `FPU_OUT <= $c32("FLE(",rs1,",",rs2,")");
isFLT : `FPU_OUT <= $c32("FLT(",rs1,",",rs2,")");
isFCLASS : `FPU_OUT <= $c32("FCLASS(",rs1,")") ;
isFCVTWS : `FPU_OUT <= $c32("FCVTWS(",rs1,")");
isFCVTWUS: `FPU_OUT <= $c32("FCVTWUS(",rs1,")");
isFCVTSW : `FPU_OUT <= $c32("FCVTSW(",rs1,")");
isFCVTSWU: `FPU_OUT <= $c32("FCVTSWU(",rs1,")");
// Moves between integer and FP registers copy the bits unchanged.
isFMVXW: `FPU_OUT <= rs1;
isFMVWX: `FPU_OUT <= rs1;
endcase
`endif
// register write-back
end else if(
!(isBranch | isStore) & (rdIsFP | rdIsNZ) &
(state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit])
) begin
registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
end
end
`ifdef VERILATOR
// When doing simulations, compare the result of all operations with
// what's computed on the host CPU.
reg [31:0] z;
reg [31:0] rs1_bkp;
reg [31:0] rs2_bkp;
reg [31:0] rs3_bkp;
// Simulation-only cross-check: when an FPU instruction reaches
// WAIT_ALU_OR_MEM, fpuOut and the source operands are passed to a
// CHECK_xxx call through $c32. The value returned in z is not used
// elsewhere in this block.
always @(posedge clk) begin
// Some micro-coded instructions (FDIV/FSQRT) use rs1, rs2 and
// rs3 as temporary registers, so we need to save them to be able
// to recompute the operation on the host CPU.
// (The backups are currently referenced only by the commented-out
// FDIV/FSQRT checks below.)
if(isFPU && state[EXECUTE2_bit]) begin
rs1_bkp <= rs1;
rs2_bkp <= rs2;
rs3_bkp <= rs3;
end
if(
isFPU && state[WAIT_ALU_OR_MEM_bit] // && fpmi_PC == 0
) begin
case(1'b1)
isFMUL: z <= $c32("CHECK_FMUL(",fpuOut,",",rs1,",",rs2,")");
isFADD: z <= $c32("CHECK_FADD(",fpuOut,",",rs1,",",rs2,")");
isFSUB: z <= $c32("CHECK_FSUB(",fpuOut,",",rs1,",",rs2,")");
// my FDIV and FSQRT are not IEEE754 compliant !
// (checks commented-out for now)
// Note: checks use rs1_bkp and rs2_bkp because
// FDIV and FSQRT overwrite rs1 and rs2
//
//isFDIV:
// z<=$c32("CHECK_FDIV(",fpuOut,",",rs1_bkp,",",rs2_bkp,")");
//isFSQRT:
// z<=$c32("CHECK_FSQRT(",fpuOut,",",rs1_bkp,")");
isFMADD :
z<=$c32("CHECK_FMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFMSUB :
z<=$c32("CHECK_FMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFNMSUB:
z<=$c32("CHECK_FNMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFNMADD:
z<=$c32("CHECK_FNMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
isFEQ: z <= $c32("CHECK_FEQ(",fpuOut,",",rs1,",",rs2,")");
isFLT: z <= $c32("CHECK_FLT(",fpuOut,",",rs1,",",rs2,")");
isFLE: z <= $c32("CHECK_FLE(",fpuOut,",",rs1,",",rs2,")");
isFCVTWS : z <= $c32("CHECK_FCVTWS(",fpuOut,",",rs1,")");
isFCVTWUS: z <= $c32("CHECK_FCVTWUS(",fpuOut,",",rs1,")");
isFCVTSW : z <= $c32("CHECK_FCVTSW(",fpuOut,",",rs1,")");
isFCVTSWU: z <= $c32("CHECK_FCVTSWU(",fpuOut,",",rs1,")");
isFMIN: z <= $c32("CHECK_FMIN(",fpuOut,",",rs1,",",rs2,")");
isFMAX: z <= $c32("CHECK_FMAX(",fpuOut,",",rs1,",",rs2,")");
endcase
end
end
`endif
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except DIV
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
wire aluWr; // ALU write strobe
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire LTU = aluMinus[32];
wire EQ = (aluMinus[31:0] == 0);
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
/* verilator lint_off WIDTH */
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] leftshift = flip32(shifter);
/***************************************************************************/
// High-word multiplies. funct3: 1->MULH, 2->MULHSU 3->MULHU
// Operand signedness required by the RISC-V M extension:
//   MULH  : rs1 signed,   rs2 signed
//   MULHSU: rs1 signed,   rs2 unsigned
//   MULHU : rs1 unsigned, rs2 unsigned
// Each operand is widened to 33 bits: bit 32 is a copy of the sign bit
// when the operand must be treated as signed, 0 otherwise, so that one
// signed multiplier covers all the variants.
// (aluIn1 is rs1; aluIn2 is rs2 for register-register instructions.)
wire isMULH = funct3Is[1];
wire isMULHSU = funct3Is[2];
wire sign1 = aluIn1[31] & (isMULH | isMULHSU); // rs1 is signed for MULH and MULHSU
wire sign2 = aluIn2[31] & isMULH; // rs2 is signed for MULH only
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
/***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
wire [31:0] alu_base =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
(funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
// funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
// 4->DIV 5->DIVU 6->REM 7->REMU
wire [31:0] alu_mul = funct3Is[0]
? multiply[31: 0] // 0:MUL
: multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend)
: (div_sign ? -quotient : quotient);
wire aluBusy = |quotient_msk; // ALU is busy if division in progress.
reg [31:0] aluOut;
wire funcM = instr[25];
wire isDivide = instr[14];
// aluOut is registered: on every clock edge it captures the result
// selected for the current instruction (divider or multiplier result when
// isALUreg and funcM are both set, base ALU result otherwise).
always @(posedge clk) begin
aluOut <= (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
end
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// Shift-and-subtract divider working on magnitudes:
// - when aluWr is asserted, the operands are loaded (an operand is negated
//   first when the operation is signed, i.e. instr[12] == 0, and the
//   operand is negative), and the sign to apply to the result is stored in
//   div_sign;
// - on every other clock edge one division step is performed. The step
//   logic is not gated by quotient_msk, so it keeps running when no
//   division is in progress.
// quotient_msk is a shift register whose bit 32 is set when a divide is
// started; aluBusy is the OR of its bits, so the ALU reports busy until
// that bit has been shifted out.
// NOTE(review): the mask is 33 bits wide for a 32-bit quotient; presumably
// the extra cycle accounts for aluOut being registered -- confirm.
reg div_sign; // result must be negated
reg [31:0] dividend; // running remainder (result of REM/REMU)
reg [62:0] divisor; // loaded as magnitude << 31, shifted right at each step
reg [31:0] quotient; // built one bit per step, MSB first
reg [32:0] quotient_msk;
always @(posedge clk) begin
if (aluWr) begin
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
// Only bit 32 is written here; the other bits keep their value.
quotient_msk[32] <= isALUreg & funcM & isDivide;
// Remainder (instr[13]) takes the sign of the dividend; quotient is
// negative if the signs differ, except when dividing by zero.
div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] ^ aluIn2[31]) & |aluIn2);
end else begin
divisor <= divisor >> 1;
quotient_msk <= quotient_msk >> 1;
if(divisor <= {31'b0, dividend}) begin
quotient <= {quotient[30:0],1'b1};
dividend <= dividend - divisor[31:0];
end else begin
quotient <= {quotient[30:0],1'b0};
end
end
end
/***************************************************************************/
// The predicate for conditional branches.
/***************************************************************************/
wire predicate_ =
funct3Is[0] & EQ | // BEQ
funct3Is[1] & !EQ | // BNE
funct3Is[4] & LT | // BLT
funct3Is[5] & !LT | // BGE
funct3Is[6] & LTU | // BLTU
funct3Is[7] & !LTU ; // BGEU
reg predicate;
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
reg [ADDR_WIDTH-1:0] PCplusImm;
// A separate adder to compute the destination of load/store.
reg [ADDR_WIDTH-1:0] loadstore_addr;
assign mem_addr = {ADDR_PAD,
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
PC : loadstore_addr
};
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
wire [31:0] writeBackData =
/* verilator lint_off WIDTH */
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
/* verilator lint_on WIDTH */
(isLUI ? Uimm : 32'b0) | // LUI
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
(isFPU ? fpuOut : 32'b0) | // FPU
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
(isJALR | isJAL ? {ADDR_PAD,PCplus4 } : 32'b0) | // JAL, JALR
(isLoad ? LOAD_data : 32'b0); // Load
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word (=instr[13:12])
// - mem_addr[1:0]: indicates which byte/halfword is accessed
// - instr[2] is set for FLW and FSW.
wire mem_byteAccess = !instr[2] && (instr[13:12] == 2'b00);
wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01);
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
// The memory write mask:
// 1111 if writing a word
// 0011 or 1100 if writing a halfword
// (depending on loadstore_addr[1])
// 0001, 0010, 0100 or 1000 if writing a byte
// (depending on loadstore_addr[1:0])
wire [3:0] STORE_wmask =
mem_byteAccess ?
(loadstore_addr[1] ?
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
) :
mem_halfwordAccess ?
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
4'b1111;
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam EXECUTE1_bit = 2;
localparam EXECUTE2_bit = 3;
localparam WAIT_ALU_OR_MEM_bit = 4;
localparam NB_STATES = 5;
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE1 = 1 << EXECUTE1_bit;
localparam EXECUTE2 = 1 << EXECUTE2_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// The memory-read signal.
assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write.
assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (shifts) in the ALU.
assign aluWr = state[EXECUTE1_bit] & isALU;
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
`ifdef NRV_IS_IO_ADDR
wire needToWait = isLoad |
isStore & `NRV_IS_IO_ADDR(mem_addr) |
aluBusy | isFPU;
`else
wire needToWait = isLoad | isStore | aluBusy | isFPU;
`endif
// The state machine. Sequence for one instruction:
//   FETCH_INSTR -> WAIT_INSTR -> EXECUTE1 -> EXECUTE2
//   [-> WAIT_ALU_OR_MEM if needToWait] -> FETCH_INSTR
// reset is active low. It loads PC with RESET_ADDR and starts in
// WAIT_ALU_OR_MEM, so the first fetch is issued only once the ALU, the
// FPU and the memory are all idle.
always @(posedge clk) begin
if(!reset) begin
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
end else
// "Reverse case": selects the branch whose state bit is set
// (state is one-hot, so exactly one condition is true).
(* parallel_case *)
case(1'b1)
state[WAIT_INSTR_bit]: begin
if(!mem_rbusy) begin // may be high when executing from SPI flash
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE1; // also the declaration of instr).
end
end
state[EXECUTE1_bit]: begin
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to:
// PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// testing instr[5] is equivalent to testing isStore in this context.
loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
// Latch the branch condition computed by the ALU comparators.
predicate <= predicate_;
state <= EXECUTE2;
end
state[EXECUTE2_bit]: begin
// Next PC: JALR target (bit 0 cleared), taken branch / JAL target,
// or next sequential instruction.
PC <= isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
state[WAIT_ALU_OR_MEM_bit]: begin
if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
state <= FETCH_INSTR;
end
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
/***************************************************************************/
// Cycle counter
/***************************************************************************/
`ifdef NRV_COUNTER_WIDTH
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`else
reg [31:0] cycles;
`endif
always @(posedge clk) cycles <= cycles + 1;
endmodule
/*****************************************************************************/

View File

@@ -0,0 +1,452 @@
/*******************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
//
// This version: The "electron", with RV32IM support.
// A single VERILOG file, compact & understandable code.
//
// Instruction set: RV32IM
//
// Parameters:
// Reset address can be defined using RESET_ADDR (default is 0).
//
// The ADDR_WIDTH parameter lets you define the width of the internal
// address bus (and address computation logic).
//
// Bruno Levy, Matthias Koch, 2020-2021
/*******************************************************************/
// Firmware generation flags for this processor
`define NRV_ARCH "rv32im"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-O3"
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input reset // set to 0 to reset the processor
);
parameter RESET_ADDR = 32'h00000000;
parameter ADDR_WIDTH = 24;
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// The destination register
wire [4:0] rdId = instr[11:7];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *)
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
/* verilator lint_on UNUSED */
// Base RISC-V (RV32I) has only 10 different instructions !
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] registerFile [31:0];
always @(posedge clk) begin
if (writeBack)
if (rdId != 0)
registerFile[rdId] <= writeBackData;
end
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except division.
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
wire aluWr; // ALU write strobe, starts dividing.
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire LTU = aluMinus[32];
wire EQ = (aluMinus[31:0] == 0);
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
wire [31:0] shifter_in = funct3Is[1] ?
{aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
aluIn1[30], aluIn1[31]} : aluIn1;
/* verilator lint_off WIDTH */
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] leftshift = {
shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
shifter[30], shifter[31]};
/***************************************************************************/
wire funcM = instr[25];
wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
// High-word multiplies. funct3: 1->MULH, 2->MULHSU 3->MULHU
// Operand signedness required by the RISC-V M extension:
//   MULH  : rs1 signed,   rs2 signed
//   MULHSU: rs1 signed,   rs2 unsigned
//   MULHU : rs1 unsigned, rs2 unsigned
// Each operand is widened to 33 bits: bit 32 is a copy of the sign bit
// when the operand must be treated as signed, 0 otherwise, so that one
// signed multiplier covers all the variants.
// (aluIn1 is rs1; aluIn2 is rs2 for register-register instructions.)
wire isMULH = funct3Is[1];
wire isMULHSU = funct3Is[2];
wire sign1 = aluIn1[31] & (isMULH | isMULHSU); // rs1 is signed for MULH and MULHSU
wire sign2 = aluIn2[31] & isMULH; // rs2 is signed for MULH only
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
/***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
wire [31:0] aluOut_base =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
(funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
wire [31:0] aluOut_muldiv =
( funct3Is[0] ? multiply[31: 0] : 32'b0) | // 0:MUL
( |funct3Is[3:1] ? multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
( instr[14] ? div_sign ? -divResult : divResult : 32'b0) ;
// 4:DIV, 5:DIVU, 6:REM, 7:REMU
wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [31:0] quotient_msk;
wire divstep_do = divisor <= {31'b0, dividend};
wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] != aluIn2[31]) & |aluIn2);
always @(posedge clk) begin
if (isDivide & aluWr) begin
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
quotient_msk <= 1 << 31;
end else begin
dividend <= dividendN;
divisor <= divisor >> 1;
quotient <= quotientN;
quotient_msk <= quotient_msk >> 1;
end
end
reg [31:0] divResult;
always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
/***************************************************************************/
// The predicate for conditional branches.
/***************************************************************************/
wire predicate =
funct3Is[0] & EQ | // BEQ
funct3Is[1] & !EQ | // BNE
funct3Is[4] & LT | // BLT
funct3Is[5] & !LT | // BGE
funct3Is[6] & LTU | // BLTU
funct3Is[7] & !LTU ; // BGEU
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// A separate adder to compute the destination of load/store.
// testing instr[5] is equivalent to testing isStore in this context.
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
/* verilator lint_off WIDTH */
// internal address registers and cycles counter may have less than
// 32 bits, so we deactivate width test for mem_addr and writeBackData
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
PC : loadstore_addr;
/***************************************************************************/
// Counter.
/***************************************************************************/
reg [63:0] cycles; // Cycle counter
always @(posedge clk) cycles <= cycles + 1;
wire sel_cyclesh = (instr[31:20] == 12'hC80);
wire [31:0] CSR_read = sel_cyclesh ? cycles[63:32] : cycles[31:0];
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
wire [31:0] writeBackData =
(isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
(isLUI ? Uimm : 32'b0) | // LUI
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
(isAUIPC ? PCplusImm : 32'b0) | // AUIPC
(isJALR | isJAL ? PCplus4 : 32'b0) | // JAL, JALR
(isLoad ? LOAD_data : 32'b0) ; // Load
/* verilator lint_on WIDTH */
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - mem_addr[1:0]: indicates which byte/halfword is accessed
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
// The memory write mask:
// 1111 if writing a word
// 0011 or 1100 if writing a halfword
// (depending on loadstore_addr[1])
// 0001, 0010, 0100 or 1000 if writing a byte
// (depending on loadstore_addr[1:0])
wire [3:0] STORE_wmask =
mem_byteAccess ?
(loadstore_addr[1] ?
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
) :
mem_halfwordAccess ?
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
4'b1111;
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam EXECUTE_bit = 2;
localparam WAIT_ALU_OR_MEM_bit = 3;
localparam NB_STATES = 4;
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE = 1 << EXECUTE_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable.
wire writeBack = ~(isBranch | isStore ) &
(state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
// The memory-read signal.
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write.
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (shifts) in the ALU.
assign aluWr = state[EXECUTE_bit] & isALU;
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
wire needToWait = isLoad | isStore | isDivide;
wire [ADDR_WIDTH-1:0] PC_new =
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
always @(posedge clk) begin
if(!reset) begin
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
end else
// See note [1] at the end of this file.
(* parallel_case *)
case(1'b1)
state[WAIT_INSTR_bit]: begin
if(!mem_rbusy) begin // may be high when executing from SPI flash
rs1 <= registerFile[mem_rdata[19:15]];
rs2 <= registerFile[mem_rdata[24:20]];
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE; // also the declaration of instr).
end
end
state[EXECUTE_bit]: begin
PC <= PC_new;
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
state[WAIT_ALU_OR_MEM_bit]: begin
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
`ifdef BENCH
initial begin
cycles = 0;
registerFile[0] = 0;
end
`endif
endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements.
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

View File

@@ -0,0 +1,674 @@
/******************************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
//
// This version: The "Gracilis", with full interrupt and
// RVC compressed instructions support.
// A single VERILOG file, compact & understandable code.
//
// Instruction set: RV32IMC + CSR + MRET
//
// Parameters:
// Reset address can be defined using RESET_ADDR (default is 0).
//
// The ADDR_WIDTH parameter lets you define the width of the internal
// address bus (and address computation logic).
//
// Bruno Levy, Matthias Koch, 2020-2021
/******************************************************************************/
// Firmware generation flags for this processor
// NOTE(review): the file header above lists the instruction set as
// RV32IMC + CSR + MRET, while NRV_ARCH below also names the "a" (atomic)
// extension - confirm that this is intentional.
`define NRV_ARCH "rv32imac"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-O3"
`define NRV_INTERRUPTS
/*
 * FemtoRV32 "Gracilis" processor core.
 *
 * Multi-cycle core sequenced by a one-hot state machine:
 *    FETCH_INSTR -> WAIT_INSTR -> EXECUTE [-> WAIT_ALU_OR_MEM(_SKIP)] -> ...
 *
 * - Instructions and data share the mem_addr / mem_rdata port.
 * - Every fetched instruction goes through the `decompressor` module, which
 *   expands 16-bit compressed encodings and passes 32-bit ones unchanged.
 * - The last instruction word read is kept in a one-word cache
 *   (cached_addr / cached_data). When the next instruction lies entirely in
 *   that word, the FETCH_INSTR state is skipped.
 * - interrupt_request is latched and taken in EXECUTE when mstatus is set and
 *   mcause is clear: PC <- mtvec, mepc <- address of the next instruction.
 *   Any SYSTEM instruction with funct3 == 0 returns from the handler.
 *
 * Parameters:
 *    RESET_ADDR  value loaded in PC while reset is low.
 *    ADDR_WIDTH  width of PC and of the address adders.
 */
module FemtoRV32(
   input         clk,
   output [31:0] mem_addr,  // address bus
   output [31:0] mem_wdata, // data to be written
   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
   input  [31:0] mem_rdata, // input lines for both data and instr
   output        mem_rstrb, // active to initiate memory read (used by IO)
   input         mem_rbusy, // asserted if memory is busy reading value
   input         mem_wbusy, // asserted if memory is busy writing value
   input         interrupt_request,
   input         reset      // set to 0 to reset the processor
);

   parameter RESET_ADDR = 32'h00000000;
   parameter ADDR_WIDTH = 24;

   /***************************************************************************/
   // Instruction decoding.
   /***************************************************************************/

   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
   // Reference: Table page 104 of:
   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

   // The destination register
   wire [4:0] rdId = instr[11:7];

   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
   // It is used as follows: funct3Is[val] <=> funct3 == val
   (* onehot *)
   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];

   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
   /* verilator lint_on UNUSED */

   // Base RISC-V (RV32I) has only 10 different instruction classes!
   // They are recognized from instr[6:2] (bits 1:0 are not stored in instr).
   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5

   wire isALU = isALUimm | isALUreg;

   /***************************************************************************/
   // The register file.
   /***************************************************************************/

   // rs1 and rs2 are latched copies of the two source registers, read from
   // registerFile when the instruction is latched (WAIT_INSTR state).
   reg [31:0] rs1;
   reg [31:0] rs2;
   reg [31:0] registerFile [31:0];

   // Register 0 is never written.
   always @(posedge clk) begin
      if (writeBack)
         if (rdId != 0)
            registerFile[rdId] <= writeBackData;
   end

   /***************************************************************************/
   // The ALU. Does operations and tests combinatorially, except divisions.
   /***************************************************************************/

   // First ALU source, always rs1
   wire [31:0] aluIn1 = rs1;

   // Second ALU source, depends on opcode:
   //    ALUreg, Branch:     rs2
   //    ALUimm, Load, JALR: Iimm
   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;

   wire aluWr;               // ALU write strobe, starts dividing.

   // The adder is used by both arithmetic instructions and JALR.
   wire [31:0] aluPlus = aluIn1 + aluIn2;

   // Use a single 33 bits subtract to do subtraction and all comparisons
   // (trick borrowed from swapforth/J1)
   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
   wire        LTU = aluMinus[32];
   wire        EQ  = (aluMinus[31:0] == 0);

   /***************************************************************************/

   // Use the same shifter both for left and right shifts by
   // applying bit reversal

   wire [31:0] shifter_in = funct3Is[1] ?
     {aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
      aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
      aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
      aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
      aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
      aluIn1[30], aluIn1[31]} : aluIn1;

   // 33-bit arithmetic right shift: the extra MSB is the sign bit only when
   // instr[30] is set, otherwise zeros are shifted in.
   /* verilator lint_off WIDTH */
   wire [31:0] shifter =
               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
   /* verilator lint_on WIDTH */

   wire [31:0] leftshift = {
     shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
     shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
     shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
     shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
     shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
     shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
     shifter[30], shifter[31]};

   /***************************************************************************/

   wire funcM    = instr[25];
   wire isDivide = isALUreg & funcM & instr[14];
   wire aluBusy  = |quotient_msk; // ALU is busy if division is in progress.

   // funct3: 1->MULH, 2->MULHSU  3->MULHU
   wire isMULH   = funct3Is[1];
   wire isMULHSU = funct3Is[2];

   // Operands are extended to 33 bits so that one signed multiplier covers
   // the signed, signed-unsigned and unsigned variants.
   wire sign1 = aluIn1[31] &  isMULH;
   wire sign2 = aluIn2[31] & (isMULH | isMULHSU);

   wire signed [32:0] signed1 = {sign1, aluIn1};
   wire signed [32:0] signed2 = {sign2, aluIn2};
   wire signed [63:0] multiply = signed1 * signed2;

   /***************************************************************************/

   // Notes:
   // - instr[30] is 1 for SUB and 0 for ADD
   // - for SUB, need to test also instr[5] to discriminate ADDI:
   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL

   wire [31:0] aluOut_base =
     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
     (funct3Is[1]  ? leftshift                                       : 32'b0) |
     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
     (funct3Is[5]  ? shifter                                         : 32'b0) |
     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;

   wire [31:0] aluOut_muldiv =
     (  funct3Is[0]   ?  multiply[31: 0] : 32'b0) | // 0:MUL
     ( |funct3Is[3:1] ?  multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
     (  instr[14]     ?  div_sign ? -divResult : divResult : 32'b0) ;
                                                 // 4:DIV, 5:DIVU, 6:REM, 7:REMU

   wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;

   /***************************************************************************/
   // Implementation of DIV/REM instructions, highly inspired by PicoRV32

   reg [31:0] dividend;
   reg [62:0] divisor;
   reg [31:0] quotient;
   reg [31:0] quotient_msk;

   wire divstep_do = (divisor <= {31'b0, dividend});

   wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
   wire [31:0] quotientN = divstep_do ? quotient | quotient_msk  : quotient;

   // Sign of the result (signed variants only, instr[12] == 0):
   //   remainder (instr[13] == 1): sign of the dividend
   //   quotient:                   operand signs differ and divisor != 0
   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
                     (aluIn1[31] != aluIn2[31]) & |aluIn2);

   // One quotient bit is resolved per clock; the division is over when the
   // single bit of quotient_msk has been shifted out (see aluBusy).
   always @(posedge clk) begin
      if (isDivide & aluWr) begin
         dividend     <=  ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
         divisor      <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
         quotient     <= 0;
         quotient_msk <= 1 << 31;
      end else begin
         dividend     <= dividendN;
         divisor      <= divisor >> 1;
         quotient     <= quotientN;
         quotient_msk <= quotient_msk >> 1;
      end
   end

   reg [31:0] divResult;
   always @(posedge clk) begin
      divResult <= instr[13] ? dividendN : quotientN;
   end

   /***************************************************************************/
   // The predicate for conditional branches.
   /***************************************************************************/

   wire predicate =
        funct3Is[0] &  EQ  | // BEQ
        funct3Is[1] & !EQ  | // BNE
        funct3Is[4] &  LT  | // BLT
        funct3Is[5] & !LT  | // BGE
        funct3Is[6] &  LTU | // BLTU
        funct3Is[7] & !LTU ; // BGEU

   /***************************************************************************/
   // Program counter and branch target computation.
   /***************************************************************************/

   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
                             // ignored (not used in RV32I base instr set).

   // The next sequential address depends on the size of the current
   // instruction (long_instr is latched together with instr).
   wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
   wire [ADDR_WIDTH-1:0] PCinc   = long_instr ? PCplus4 : PCplus2;

   // An adder used to compute branch address, JAL address and AUIPC.
   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
                                                       Bimm[ADDR_WIDTH-1:0] );

   // A separate adder to compute the destination of load/store.
   // testing instr[5] is equivalent to testing isStore in this context.
   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);

   // While fetching, the address bus carries the word-aligned address of the
   // instruction word (or of the following word when reading the second half
   // of an unaligned 32-bit instruction); otherwise the load/store address.
   /* verilator lint_off WIDTH */
   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
                     fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
                                       : {PC     [ADDR_WIDTH-1:2], 2'b00}
                     : loadstore_addr ;
   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // Interrupt logic, CSR registers and opcodes.
   /***************************************************************************/

   // Remember interrupt requests as they are not checked for every cycle
   reg  interrupt_request_sticky;

   // Interrupt enable and lock logic
   wire interrupt = interrupt_request_sticky & mstatus & ~mcause;

   // Processor accepts interrupts in EXECUTE state.
   wire interrupt_accepted = interrupt & state[EXECUTE_bit];

   // If current interrupt is accepted, there already might be the next one,
   // which should not be missed:
   always @(posedge clk) begin
      interrupt_request_sticky <=
         interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
   end

   // Decoder for mret opcode. The imm12 comparison is disabled, so every
   // SYSTEM instruction with funct3 == 0 is handled as a return.
   wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);

   // CSRs:
   reg  [ADDR_WIDTH-1:0] mepc;    // The saved program counter.
   reg  [ADDR_WIDTH-1:0] mtvec;   // The address of the interrupt handler.
   reg                   mstatus; // Interrupt enable
   reg                   mcause;  // Interrupt cause (and lock)
   reg  [63:0]           cycles;  // Cycle counter

   always @(posedge clk) cycles <= cycles + 1;

   wire sel_mstatus = (instr[31:20] == 12'h300);
   wire sel_mtvec   = (instr[31:20] == 12'h305);
   wire sel_mepc    = (instr[31:20] == 12'h341);
   wire sel_mcause  = (instr[31:20] == 12'h342);
   wire sel_cycles  = (instr[31:20] == 12'hC00);
   wire sel_cyclesh = (instr[31:20] == 12'hC80);

   // Read CSRs (mstatus is presented in bit 3, mcause in bit 31)
   /* verilator lint_off WIDTH */
   wire [31:0] CSR_read =
     (sel_mstatus ?    {28'b0, mstatus, 3'b0}  : 32'b0) |
     (sel_mtvec   ?    mtvec                   : 32'b0) |
     (sel_mepc    ?    mepc                    : 32'b0) |
     (sel_mcause  ?    {mcause, 31'b0}         : 32'b0) |
     (sel_cycles  ?    cycles[31:0]            : 32'b0) |
     (sel_cyclesh ?    cycles[63:32]           : 32'b0) ;
   /* verilator lint_on WIDTH */

   // Write CSRs: 5 bit unsigned immediate or content of RS1
   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;

   wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read  :
                           (instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
                        /* (instr[13:12] == 2'b01) ? */  CSR_modifier ;

   // Only mstatus and mtvec are written by CSR instructions; mepc and mcause
   // are updated by the state machine when an interrupt is taken or returned
   // from.
   always @(posedge clk) begin
      if(!reset) begin
         mstatus <= 0;
      end else begin
         // Execute a CSR opcode
         if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
            if (sel_mstatus) mstatus <= CSR_write[3];
            if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
         end
      end
   end

   /***************************************************************************/
   // The value written back to the register file.
   /***************************************************************************/

   /* verilator lint_off WIDTH */
   wire [31:0] writeBackData  =
      (isSYSTEM            ? CSR_read  : 32'b0) |  // SYSTEM
      (isLUI               ? Uimm      : 32'b0) |  // LUI
      (isALU               ? aluOut    : 32'b0) |  // ALUreg, ALUimm
      (isAUIPC             ? PCplusImm : 32'b0) |  // AUIPC
      (isJALR   | isJAL    ? PCinc     : 32'b0) |  // JAL, JALR
      (isLoad              ? LOAD_data : 32'b0);   // Load
   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // LOAD/STORE
   /***************************************************************************/

   // All memory accesses are aligned on 32 bits boundary. For this
   // reason, we need some circuitry that does unaligned halfword
   // and byte load/store, based on:
   // - funct3[1:0]:  00->byte 01->halfword 10->word
   // - mem_addr[1:0]: indicates which byte/halfword is accessed

   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;

   // LOAD, in addition to funct3[1:0], LOAD depends on:
   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion

   wire LOAD_sign =
        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

   wire [31:0] LOAD_data =
         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                          mem_rdata ;

   wire [15:0] LOAD_halfword =
               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

   wire  [7:0] LOAD_byte =
               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

   // STORE
   // The byte / halfword to be written is replicated on the lanes selected
   // by loadstore_addr[1:0]; STORE_wmask tells which lanes are written.

   assign mem_wdata[ 7: 0] = rs2[7:0];
   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];

   // The memory write mask:
   //    1111                     if writing a word
   //    0011 or 1100             if writing a halfword
   //                                (depending on loadstore_addr[1])
   //    0001, 0010, 0100 or 1000 if writing a byte
   //                                (depending on loadstore_addr[1:0])

   wire [3:0] STORE_wmask =
              mem_byteAccess      ?
                    (loadstore_addr[1] ?
                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
                    ) :
              mem_halfwordAccess ?
                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
              4'b1111;

   /***************************************************************************/
   // Unaligned fetch mechanism and compressed opcode handling
   /***************************************************************************/

   // One-word instruction cache: word address and content of the last
   // instruction word stored by the WAIT_INSTR state.
   reg [ADDR_WIDTH-1:2] cached_addr;
   reg           [31:0] cached_data;

   wire current_cache_hit = cached_addr == PC    [ADDR_WIDTH-1:2];
   wire    next_cache_hit = cached_addr == PC_new[ADDR_WIDTH-1:2];

   // A 32-bit instruction (two LSBs == 11) that starts in the upper halfword
   // of a word continues in the next word.
   wire current_unaligned_long = &cached_mem [17:16] & PC    [1];
   wire    next_unaligned_long = &cached_data[17:16] & PC_new[1];

   reg fetch_second_half; // Reading the second word of an unaligned long instr
   reg long_instr;        // Latched instruction is a 32-bit one

   wire [31:0] cached_mem   = current_cache_hit ? cached_data : mem_rdata;
   wire [31:0] decomp_input = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
                                    : cached_mem;
   wire [31:0] decompressed;

   decompressor _decomp ( .c(decomp_input), .d(decompressed) );

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   localparam FETCH_INSTR_bit          = 0;
   localparam WAIT_INSTR_bit           = 1;
   localparam EXECUTE_bit              = 2;
   localparam WAIT_ALU_OR_MEM_bit      = 3;
   localparam WAIT_ALU_OR_MEM_SKIP_bit = 4;

   localparam NB_STATES                = 5;

   localparam FETCH_INSTR          = 1 << FETCH_INSTR_bit;
   localparam WAIT_INSTR           = 1 << WAIT_INSTR_bit;
   localparam EXECUTE              = 1 << EXECUTE_bit;
   localparam WAIT_ALU_OR_MEM      = 1 << WAIT_ALU_OR_MEM_bit;
   localparam WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;

   (* onehot *)
   reg [NB_STATES-1:0] state;

   // The signals (internal and external) that are determined
   // combinatorially from state and other signals.

   // register write-back enable.
   wire writeBack = ~(isBranch | isStore ) & (
            state[EXECUTE_bit]             |
            state[WAIT_ALU_OR_MEM_bit]     |
            state[WAIT_ALU_OR_MEM_SKIP_bit]
   );

   // The memory-read signal.
   assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];

   // The mask for memory-write.
   assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;

   // aluWr starts computation (divide) in the ALU.
   assign aluWr = state[EXECUTE_bit] & isALU;

   wire jumpToPCplusImm = isJAL | (isBranch & predicate);

   wire needToWait = isLoad | isStore | isDivide;

   wire [ADDR_WIDTH-1:0] PC_new =
           isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
           jumpToPCplusImm  ? PCplusImm :
           interrupt_return ? mepc :
                              PCinc;

   always @(posedge clk) begin
      if(!reset) begin
         state             <= WAIT_ALU_OR_MEM; //Just waiting for !mem_wbusy
         PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
         mcause            <= 0;
         cached_addr       <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
         fetch_second_half <= 0;
      end else begin

         // See note [1] at the end of this file.
         (* parallel_case *)
         case(1'b1)

           state[WAIT_INSTR_bit]: begin
              if(!mem_rbusy) begin // may be high when executing from SPI flash

                 // Update cache: store the word just read when it is not the
                 // cached one, or when it is the second word of an unaligned
                 // long instruction.
                 if (~current_cache_hit | fetch_second_half) begin
                    cached_addr <= mem_addr[ADDR_WIDTH-1:2];
                    cached_data <= mem_rdata;
                 end

                 // Decode instruction
                 rs1        <= registerFile[decompressed[19:15]];
                 rs2        <= registerFile[decompressed[24:20]];
                 instr      <= decompressed[31:2];
                 long_instr <= &decomp_input[1:0];

                 // Long opcode, unaligned, first part fetched,
                 // happens in non-linear code
                 if (current_unaligned_long & ~fetch_second_half) begin
                    fetch_second_half <= 1;
                    state <= FETCH_INSTR;
                 end else begin
                    fetch_second_half <= 0;
                    state <= EXECUTE;
                 end
              end
           end

           state[EXECUTE_bit]: begin
              if (interrupt) begin
                 // Interrupt taken: jump to the handler, remember where to
                 // resume and lock further interrupts through mcause.
                 PC     <= mtvec;
                 mepc   <= PC_new;
                 mcause <= 1;
                 state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
              end else begin
                 PC <= PC_new;
                 if (interrupt_return) mcause <= 0;

                 // Skip the fetch when the next instruction is entirely
                 // contained in the cached word.
                 state <= next_cache_hit & ~next_unaligned_long
                          ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
                          : (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);

                 fetch_second_half <= next_cache_hit & next_unaligned_long;
              end
           end

           state[WAIT_ALU_OR_MEM_bit]: begin
              if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
           end

           state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
              if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= WAIT_INSTR;
           end

           default: begin // FETCH_INSTR
              state <= WAIT_INSTR;
           end

         endcase
      end
   end

`ifdef BENCH
   // Simulation only: start from a known cycle count and a zeroed x0.
   initial begin
      cycles = 0;
      registerFile[0] = 0;
   end
`endif

endmodule
/*****************************************************************************/
// If c[15:0] is a compressed instruction, decompresses it into d;
// otherwise copies c to d.
module decompressor(
   input  wire [31:0] c,
   output reg  [31:0] d
);

   // Words produced for encodings that are not expanded.
   localparam [31:0] ILLEGAL_INSTR = 32'h00000000;
   localparam [31:0] UNKNOWN_INSTR = 32'h00000000;

   // 7-bit opcodes of the 32-bit instructions generated below.
   localparam [6:0] OPC_LOAD   = 7'b0000011;
   localparam [6:0] OPC_ALUIMM = 7'b0010011;
   localparam [6:0] OPC_STORE  = 7'b0100011;
   localparam [6:0] OPC_ALUREG = 7'b0110011;
   localparam [6:0] OPC_LUI    = 7'b0110111;
   localparam [6:0] OPC_BRANCH = 7'b1100011;
   localparam [6:0] OPC_JALR   = 7'b1100111;
   localparam [6:0] OPC_JAL    = 7'b1101111;

   // Registers that are implied by some compressed encodings.
   localparam [4:0] REG_X0 = 5'd0;
   localparam [4:0] REG_X1 = 5'd1;
   localparam [4:0] REG_X2 = 5'd2;

   // Register fields. The 3-bit fields address x8..x15 (prefix 2'b01).
   wire [4:0] creg_lo = {2'b01, c[4:2]}; // 3-bit register field in c[4:2]
   wire [4:0] creg_hi = {2'b01, c[9:7]}; // 3-bit register field in c[9:7]
   wire [4:0] xreg_lo = c[ 6:2];         // 5-bit register field in c[6:2]
   wire [4:0] xreg_hi = c[11:7];         // 5-bit register field in c[11:7]

   // Immediate fields, reassembled in natural bit order.
   wire [4:0]  shamt        = c[6:2];
   wire [11:0] imm_addi4spn = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
   wire [11:0] imm_lwsw     = {5'b00000, c[5], c[12:10], c[6], 2'b00};
   wire [11:0] imm_lwsp     = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
   wire [11:0] imm_swsp     = {4'b0000, c[8:7], c[12:9], 2'b00};
   wire [11:0] imm_addi16sp = {{3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
   wire [11:0] imm_ci       = {{7{c[12]}}, c[6:2]};
   // Branch / jump offsets without their (always zero) bit 0, and the
   // upper 20 bits of the LUI immediate.
   wire [12:1] off_branch   = {{5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3]};
   wire [20:1] off_jump     = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2],
                               c[11], c[5:3]};
   wire [19:0] imm_lui_hi   = {{15{c[12]}}, c[6:2]};

   // I-type layout: imm[11:0] | rs1 | funct3 | rd | opcode
   function [31:0] fmt_i(input [11:0] imm, input [4:0] rs1, input [2:0] f3,
                         input [4:0]  rd,  input [6:0] opc);
      fmt_i = {imm, rs1, f3, rd, opc};
   endfunction

   // R-type layout: funct7 | rs2 | rs1 | funct3 | rd | opcode
   // (also used for immediate shifts, with the shift amount in the rs2 slot)
   function [31:0] fmt_r(input [6:0] f7, input [4:0] rs2, input [4:0] rs1,
                         input [2:0] f3, input [4:0] rd,  input [6:0] opc);
      fmt_r = {f7, rs2, rs1, f3, rd, opc};
   endfunction

   // S-type layout: imm[11:5] | rs2 | rs1 | funct3 | imm[4:0] | STORE
   function [31:0] fmt_s(input [11:0] imm, input [4:0] rs2, input [4:0] rs1,
                         input [2:0]  f3);
      fmt_s = {imm[11:5], rs2, rs1, f3, imm[4:0], OPC_STORE};
   endfunction

   // B-type layout: off[12|10:5] | rs2 | rs1 | funct3 | off[4:1|11] | BRANCH
   function [31:0] fmt_b(input [12:1] off, input [4:0] rs2, input [4:0] rs1,
                         input [2:0]  f3);
      fmt_b = {off[12], off[10:5], rs2, rs1, f3, off[4:1], off[11], OPC_BRANCH};
   endfunction

   // J-type layout: off[20|10:1|11|19:12] | rd | JAL
   function [31:0] fmt_j(input [20:1] off, input [4:0] rd);
      fmt_j = {off[20], off[10:1], off[11], off[19:12], rd, OPC_JAL};
   endfunction

   // The first matching pattern wins, so specific patterns come before the
   // more general ones they overlap with.
   always @* begin
      casez (c[15:0])
        // Long opcode, no need to decompress
        16'b???___????????_???_11 : d = c;

        /* verilator lint_off CASEOVERLAP */
        // c.illegal   -->  illegal
        16'b000___00000000_000_00 : d = ILLEGAL_INSTR;
        // c.addi4spn  -->  addi rd', x2, nzuimm[9:2]
        16'b000___????????_???_00 : d = fmt_i(imm_addi4spn, REG_X2, 3'b000, creg_lo, OPC_ALUIMM);
        /* verilator lint_on CASEOVERLAP */

        // c.lw        -->  lw   rd', offset[6:2](rs1')
        16'b010_???_???_??_???_00 : d = fmt_i(imm_lwsw, creg_hi, 3'b010, creg_lo, OPC_LOAD);
        // c.sw        -->  sw   rs2', offset[6:2](rs1')
        16'b110_???_???_??_???_00 : d = fmt_s(imm_lwsw, creg_lo, creg_hi, 3'b010);

        // c.addi      -->  addi rd, rd, nzimm[5:0]
        16'b000_???_???_??_???_01 : d = fmt_i(imm_ci, xreg_hi, 3'b000, xreg_hi, OPC_ALUIMM);
        // c.jal       -->  jal  x1, offset[11:1]
        16'b001____???????????_01 : d = fmt_j(off_jump, REG_X1);
        // c.li        -->  addi rd, x0, imm[5:0]
        16'b010__?_?????_?????_01 : d = fmt_i(imm_ci, REG_X0, 3'b000, xreg_hi, OPC_ALUIMM);
        // c.addi16sp  -->  addi x2, x2, nzimm[9:4]
        16'b011__?_00010_?????_01 : d = fmt_i(imm_addi16sp, xreg_hi, 3'b000, xreg_hi, OPC_ALUIMM);
        // c.lui       -->  lui  rd, nzuimm[17:12]
        16'b011__?_?????_?????_01 : d = {imm_lui_hi, xreg_hi, OPC_LUI};
        // c.srli      -->  srli rd', rd', shamt[5:0]
        16'b100_?_00_???_?????_01 : d = fmt_r(7'b0000000, shamt, creg_hi, 3'b101, creg_hi, OPC_ALUIMM);
        // c.srai      -->  srai rd', rd', shamt[5:0]
        16'b100_?_01_???_?????_01 : d = fmt_r(7'b0100000, shamt, creg_hi, 3'b101, creg_hi, OPC_ALUIMM);
        // c.andi      -->  andi rd', rd', imm[5:0]
        16'b100_?_10_???_?????_01 : d = fmt_i(imm_ci, creg_hi, 3'b111, creg_hi, OPC_ALUIMM);
        // c.sub       -->  sub  rd', rd', rs2'
        16'b100_011_???_00_???_01 : d = fmt_r(7'b0100000, creg_lo, creg_hi, 3'b000, creg_hi, OPC_ALUREG);
        // c.xor       -->  xor  rd', rd', rs2'
        16'b100_011_???_01_???_01 : d = fmt_r(7'b0000000, creg_lo, creg_hi, 3'b100, creg_hi, OPC_ALUREG);
        // c.or        -->  or   rd', rd', rs2'
        16'b100_011_???_10_???_01 : d = fmt_r(7'b0000000, creg_lo, creg_hi, 3'b110, creg_hi, OPC_ALUREG);
        // c.and       -->  and  rd', rd', rs2'
        16'b100_011_???_11_???_01 : d = fmt_r(7'b0000000, creg_lo, creg_hi, 3'b111, creg_hi, OPC_ALUREG);
        // c.j         -->  jal  x0, offset[11:1]
        16'b101____???????????_01 : d = fmt_j(off_jump, REG_X0);
        // c.beqz      -->  beq  rs1', x0, offset[8:1]
        16'b110__???_???_?????_01 : d = fmt_b(off_branch, REG_X0, creg_hi, 3'b000);
        // c.bnez      -->  bne  rs1', x0, offset[8:1]
        16'b111__???_???_?????_01 : d = fmt_b(off_branch, REG_X0, creg_hi, 3'b001);

        // c.slli      -->  slli rd, rd, shamt[5:0]
        16'b000__?_?????_?????_10 : d = fmt_r(7'b0000000, shamt, xreg_hi, 3'b001, xreg_hi, OPC_ALUIMM);
        // c.lwsp      -->  lw   rd, offset[7:2](x2)
        16'b010__?_?????_?????_10 : d = fmt_i(imm_lwsp, REG_X2, 3'b010, xreg_hi, OPC_LOAD);
        // c.jr        -->  jalr x0, rs1, 0
        16'b100__0_?????_00000_10 : d = fmt_i(12'b000000000000, xreg_hi, 3'b000, REG_X0, OPC_JALR);
        // c.mv        -->  add  rd, x0, rs2
        16'b100__0_?????_?????_10 : d = fmt_r(7'b0000000, xreg_lo, REG_X0, 3'b000, xreg_hi, OPC_ALUREG);
        // c.ebreak    -->  ebreak   (disabled)
        // 16'b100__1_00000_00000_10 : d = { 25'b00000000_00010000_00000000_0, 7'b11100_11} ;
        // c.jalr      -->  jalr x1, rs1, 0
        16'b100__1_?????_00000_10 : d = fmt_i(12'b000000000000, xreg_hi, 3'b000, REG_X1, OPC_JALR);
        // c.add       -->  add  rd, rd, rs2
        16'b100__1_?????_?????_10 : d = fmt_r(7'b0000000, xreg_lo, xreg_hi, 3'b000, xreg_hi, OPC_ALUREG);
        // c.swsp      -->  sw   rs2, offset[7:2](x2)
        16'b110__?_?????_?????_10 : d = fmt_s(imm_swsp, xreg_lo, REG_X2, 3'b010);

        // Unknown opcode
        default                   : d = UNKNOWN_INSTR;
      endcase
   end

endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements.
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

View File

@@ -0,0 +1,730 @@
/******************************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
//
// This version: The "Individua", with full interrupt, atomic and
// RVC compressed instructions support.
// A single VERILOG file, compact & understandable code.
//
// Instruction set: RV32IMAC + CSR + MRET
//
// Parameters:
// Reset address can be defined using RESET_ADDR (default is 0).
//
// The ADDR_WIDTH parameter lets you define the width of the internal
// address bus (and address computation logic).
//
// Bruno Levy, Matthias Koch, 2020-2021
/******************************************************************************/
// Firmware generation flags for this processor
// NOTE(review): none of these macros is referenced by the RTL visible in this
// file; presumably they are read by the firmware/toolchain build scripts to
// pick compiler options -- confirm against the build system.
// ISA string; agrees with the "RV32IMAC" instruction set named in the header.
`define NRV_ARCH "rv32imac"
// ABI string.
`define NRV_ABI "ilp32"
// Optimization flag string.
`define NRV_OPTIMIZE "-O3"
// Value-less marker macro; presumably advertises that this core has
// interrupt support (see interrupt_request below) -- confirm.
`define NRV_INTERRUPTS
// FemtoRV32 "Individua": a multi-cycle RV32IMAC core with CSR/MRET-based
// interrupt support, sequenced by a one-hot state machine (end of module).
//
// Bus interface (a single bus is shared by instruction fetch and data):
//   mem_addr   address of the current access. Instruction fetches are
//              word-aligned; load/store addresses keep their two LSBs, which
//              also select the byte/halfword lane inside the word.
//   mem_wdata  store data, already replicated onto the addressed byte lanes.
//   mem_wmask  per-byte write enables; non-zero means "write".
//   mem_rdata  word returned by the memory (instructions and data).
//   mem_rstrb  asserted to start a read.
//   mem_rbusy / mem_wbusy  memory-side wait signals.
//   interrupt_request      remembered internally until it can be accepted.
//   reset      active low.
//
// Parameters:
//   RESET_ADDR  value loaded into PC while reset is low.
//   ADDR_WIDTH  width of PC and of every address adder.
module FemtoRV32(
   input         clk,
   output [31:0] mem_addr,  // address bus
   output [31:0] mem_wdata, // data to be written
   output [3:0]  mem_wmask, // write mask for the 4 bytes of each word
   input  [31:0] mem_rdata, // input lines for both data and instr
   output        mem_rstrb, // active to initiate memory read (used by IO)
   input         mem_rbusy, // asserted if memory is busy reading value
   input         mem_wbusy, // asserted if memory is busy writing value
   input         interrupt_request,
   input         reset      // set to 0 to reset the processor
);

   parameter RESET_ADDR = 32'h00000000;
   parameter ADDR_WIDTH = 24;

   /***************************************************************************/
   // Instruction decoding.
   /***************************************************************************/

   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
   // Reference: Table page 104 of:
   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

   // The destination register
   wire [4:0] rdId = instr[11:7];

   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
   // It is used as follows: funct3Is[val] <=> funct3 == val
   (* onehot *)
   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];

   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
   wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
   /* verilator lint_on UNUSED */

   // The opcode classes recognized by this core, decoded from instr[6:2]
   // (instr[1:0] is not stored: compressed instructions have already been
   // expanded to their 32-bit form when instr is latched).
   wire isLoad    = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
   wire isALUimm  = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
   wire isAUIPC   = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
   wire isStore   = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
   wire isAMO     = (instr[6:2] == 5'b01011); // atomics: lr.w, sc.w, amo*.w
   wire isALUreg  = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
   wire isLUI     = (instr[6:2] == 5'b01101); // rd <- Uimm
   wire isBranch  = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
   wire isJALR    = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
   wire isJAL     = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
   wire isSYSTEM  = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5

   wire isALU = isALUimm | isALUreg;

   /***************************************************************************/
   // The register file.
   /***************************************************************************/

   // rs1 and rs2 hold the source operands; they are loaded from registerFile
   // in the WAIT_INSTR state, at the same time as instr.
   reg [31:0] rs1;
   reg [31:0] rs2;
   reg [31:0] registerFile [31:0];

   // Write-back port. Register 0 is never written.
   always @(posedge clk) begin
      if (writeBack)
         if (rdId != 0)
            registerFile[rdId] <= writeBackData;
   end

   /***************************************************************************/
   // The ALU. Does operations and tests combinatorially, except divisions.
   /***************************************************************************/

   // First ALU source: rs1, except for atomic instructions, which operate on
   // the word just read from memory.
   wire [31:0] aluIn1 = isAMO ? mem_rdata : rs1;

   // Second ALU source, depends on opcode:
   //    AMO, ALUreg, Branch: rs2
   //    everything else (ALUimm, Load, JALR, ...): Iimm
   wire [31:0] aluIn2 = isAMO | isALUreg | isBranch ? rs2 : Iimm;

   wire aluWr;               // ALU write strobe, starts dividing.

   // The adder is used by both arithmetic instructions and JALR.
   wire [31:0] aluPlus = aluIn1 + aluIn2;

   // Use a single 33 bits subtract to do subtraction and all comparisons
   // (trick borrowed from swapforth/J1)
   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
   wire        LTU = aluMinus[32];
   wire        EQ  = (aluMinus[31:0] == 0);

   /***************************************************************************/

   // Use the same shifter both for left and right shifts by
   // applying bit reversal
   wire [31:0] shifter_in = funct3Is[1] ?
     {aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
      aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
      aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
      aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
      aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
      aluIn1[30], aluIn1[31]} : aluIn1;

   // 33-bit arithmetic right shift; the extra MSB is the sign bit for SRA
   // (instr[30] set) and 0 otherwise.
   /* verilator lint_off WIDTH */
   wire [31:0] shifter =
               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
   /* verilator lint_on WIDTH */

   // Bit-reversed shifter output: the result of a left shift.
   wire [31:0] leftshift = {
     shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
     shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
     shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
     shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
     shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
     shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
     shifter[30], shifter[31]};

   /***************************************************************************/

   wire funcM    = instr[25];
   wire isDivide = isALUreg & funcM & instr[14];
   wire aluBusy  = |quotient_msk; // ALU is busy if division is in progress.

   // funct3: 1->MULH, 2->MULHSU  3->MULHU
   wire isMULH   = funct3Is[1];
   wire isMULHSU = funct3Is[2];

   // Operands are extended to 33 bits so that one signed multiplier serves
   // the signed, signed*unsigned and unsigned variants.
   wire sign1 = aluIn1[31] &  isMULH;
   wire sign2 = aluIn2[31] & (isMULH | isMULHSU);

   wire signed [32:0] signed1 = {sign1, aluIn1};
   wire signed [32:0] signed2 = {sign2, aluIn2};
   wire signed [63:0] multiply = signed1 * signed2;

   /***************************************************************************/

   // Notes:
   // - instr[30] is 1 for SUB and 0 for ADD
   // - for SUB, need to test also instr[5] to discriminate ADDI:
   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL

   wire [31:0] aluOut_base =
     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
     (funct3Is[1]  ? leftshift                                       : 32'b0) |
     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
     (funct3Is[5]  ? shifter                                         : 32'b0) |
     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;

   wire [31:0] aluOut_muldiv =
     (  funct3Is[0]   ?  multiply[31: 0] : 32'b0) | // 0:MUL
     ( |funct3Is[3:1] ?  multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
     (  instr[14]     ?  div_sign ? -divResult : divResult : 32'b0) ;
                                                 // 4:DIV, 5:DIVU, 6:REM, 7:REMU

   wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;

   /***************************************************************************/
   // Implementation of DIV/REM instructions, highly inspired by PicoRV32

   reg [31:0] dividend;
   reg [62:0] divisor;
   reg [31:0] quotient;
   reg [31:0] quotient_msk;

   // One step of the shift-and-subtract division: when the (shifted) divisor
   // fits into the dividend, subtract it and set the current quotient bit.
   wire divstep_do = (divisor <= {31'b0, dividend});

   wire [31:0] dividendN     = divstep_do ? dividend - divisor[31:0] : dividend;
   wire [31:0] quotientN     = divstep_do ? quotient | quotient_msk  : quotient;

   // Asserted when the unsigned result must be negated. Only for the signed
   // variants (instr[12] == 0); for REM (instr[13]) the sign follows the
   // dividend, for DIV the operand signs must differ and the divisor must be
   // non-zero.
   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
                    (aluIn1[31] != aluIn2[31]) & |aluIn2);

   // The divider datapath. It is loaded when a division is issued and then
   // shifts on every clock; quotient_msk walks from bit 31 down to zero, and
   // aluBusy stays high as long as it is non-zero.
   always @(posedge clk) begin
      if (isDivide & aluWr) begin
         dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
         divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
         quotient <= 0;
         quotient_msk <= 1 << 31;
      end else begin
         dividend     <= dividendN;
         divisor      <= divisor >> 1;
         quotient     <= quotientN;
         quotient_msk <= quotient_msk >> 1;
      end
   end

   // Registered result: remainder if instr[13] is set, quotient otherwise.
   reg  [31:0] divResult;
   always @(posedge clk) begin
      divResult <= instr[13] ? dividendN : quotientN;
   end

   /***************************************************************************/
   // The predicate for conditional branches.
   /***************************************************************************/

   wire predicate =
        funct3Is[0] &  EQ  | // BEQ
        funct3Is[1] & !EQ  | // BNE
        funct3Is[4] &  LT  | // BLT
        funct3Is[5] & !LT  | // BGE
        funct3Is[6] &  LTU | // BLTU
        funct3Is[7] & !LTU ; // BGEU

   /***************************************************************************/
   // Special ALU for atomic opcodes
   /***************************************************************************/

   // Selected by funct5 (instr[31:27]). Here aluIn1 is the word read from
   // memory and aluIn2 is rs2 (see aluIn1 / aluIn2 above).
   wire [31:0] amoALU =
     (instr[31:27] == 5'h00 ? aluPlus                      : 32'b0) | // amoadd.w
     (instr[31:27] == 5'h01 ? aluIn2                       : 32'b0) | // amoswap.w
     (instr[31:27] == 5'h04 ? aluIn1 ^ aluIn2              : 32'b0) | // amoxor.w
     (instr[31:27] == 5'h08 ? aluIn1 | aluIn2              : 32'b0) | // amoor.w
     (instr[31:27] == 5'h0C ? aluIn1 & aluIn2              : 32'b0) | // amoand.w
     (instr[31:27] == 5'h10 ? ( LT  ? aluIn1 : aluIn2)     : 32'b0) | // amomin.w
     (instr[31:27] == 5'h14 ? (!LT  ? aluIn1 : aluIn2)     : 32'b0) | // amomax.w
     (instr[31:27] == 5'h18 ? ( LTU ? aluIn1 : aluIn2)     : 32'b0) | // amominu.w
     (instr[31:27] == 5'h1C ? (!LTU ? aluIn1 : aluIn2)     : 32'b0) ; // amomaxu.w

   reg [31:0] amo_wdata;  // amoALU result, latched when leaving WAIT_ALU_OR_MEM
   wire amo_write = state[WRITE_AMO_bit] | state[WAIT_AMO_bit];

   wire isAMOlr = instr[31:27] == 5'h02; // lr.w
   wire isAMOsc = instr[31:27] == 5'h03; // sc.w

   // Reservation set by lr.w: the reserved address, and a flag that stays set
   // until another atomic instruction, or a store to that address, executes.
   reg [ADDR_WIDTH-1:0] amo_location;
   reg amo_location_unchanged;

   wire reserved_addr = mem_addr[ADDR_WIDTH-1:0] == amo_location;

   /***************************************************************************/
   // Program counter and branch target computation.
   /***************************************************************************/

   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
                             // ignored (not used in RV32I base instr set).

   wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
   // Address of the next sequential instruction: +4 after a 32-bit
   // instruction, +2 after a compressed one.
   wire [ADDR_WIDTH-1:0] PCinc   = long_instr ? PCplus4 : PCplus2;

   // An adder used to compute branch address, JAL address and AUIPC.
   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
                                                       Bimm[ADDR_WIDTH-1:0] );

   // A separate adder to compute the destination of load/store.
   // testing instr[5] is equivalent to testing isStore in this context.
   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
                  (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);

   // Address bus:
   //  - while fetching: the word containing PC, or the following word when
   //    the second half of an unaligned 32-bit instruction is being fetched;
   //  - otherwise: rs1 for atomics, rs1+imm for loads and stores.
   /* verilator lint_off WIDTH */
   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
                     fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
                                       : {PC     [ADDR_WIDTH-1:2], 2'b00}
                     : isAMO ? rs1[ADDR_WIDTH-1:0] : loadstore_addr;
   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // Interrupt logic, CSR registers and opcodes.
   /***************************************************************************/

   // Remember interrupt requests as they are not checked for every cycle
   reg  interrupt_request_sticky;

   // Interrupt enable and lock logic
   wire interrupt = interrupt_request_sticky & mstatus & ~mcause;

   // Processor accepts interrupts in EXECUTE state.
   wire interrupt_accepted = interrupt & state[EXECUTE_bit];

   // If current interrupt is accepted, there already might be the next one,
   // which should not be missed.
   // The flag is cleared while reset is low, like mstatus and mcause, so that
   // an undefined power-up value cannot turn into a spurious interrupt the
   // first time software enables interrupts.
   always @(posedge clk) begin
      if(!reset) begin
         interrupt_request_sticky <= 0;
      end else begin
         interrupt_request_sticky <=
           interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
      end
   end

   // Decoder for mret opcode.
   // The immediate field is deliberately not compared (see the commented-out
   // test), so every SYSTEM instruction with funct3 == 0 is treated as mret.
   wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);

   // CSRs:
   reg  [ADDR_WIDTH-1:0] mepc;    // The saved program counter.
   reg  [ADDR_WIDTH-1:0] mtvec;   // The address of the interrupt handler.
   reg                   mstatus; // Interrupt enable
   reg                   mcause;  // Interrupt cause (and lock)
   reg  [63:0]           cycles;  // Cycle counter

   always @(posedge clk) cycles <= cycles + 1;

   // CSR address decoding (instr[31:20])
   wire sel_mstatus = (instr[31:20] == 12'h300);
   wire sel_mtvec   = (instr[31:20] == 12'h305);
   wire sel_mepc    = (instr[31:20] == 12'h341);
   wire sel_mcause  = (instr[31:20] == 12'h342);
   wire sel_cycles  = (instr[31:20] == 12'hC00);
   wire sel_cyclesh = (instr[31:20] == 12'hC80);

   // Read CSRs.
   // mstatus is presented at bit 3 and mcause at bit 31.
   /* verilator lint_off WIDTH */
   wire [31:0] CSR_read =
     (sel_mstatus ?    {28'b0, mstatus, 3'b0}  : 32'b0) |
     (sel_mtvec   ?    mtvec                   : 32'b0) |
     (sel_mepc    ?    mepc                    : 32'b0) |
     (sel_mcause  ?    {mcause, 31'b0}         : 32'b0) |
     (sel_cycles  ?    cycles[31:0]            : 32'b0) |
     (sel_cyclesh ?    cycles[63:32]           : 32'b0) ;
   /* verilator lint_on WIDTH */

   // Write CSRs: 5 bit unsigned immediate or content of RS1
   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;

   // funct3[1:0]: 01 -> write, 10 -> set bits, 11 -> clear bits
   wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read  :
                           (instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
                        /* (instr[13:12] == 2'b01) ? */  CSR_modifier ;

   // CSR write port. Only mstatus and mtvec can be written by CSR
   // instructions; mepc and mcause are driven by the state machine below.
   always @(posedge clk) begin
      if(!reset) begin
         mstatus <= 0;
      end else begin
         // Execute a CSR opcode
         if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
            if (sel_mstatus) mstatus <= CSR_write[3];
            if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
         end
      end
   end

   /***************************************************************************/
   // The value written back to the register file.
   /***************************************************************************/

   // One term per instruction class, ORed together. For sc.w the value
   // written is 0 when the reservation is still intact and 1 otherwise.
   /* verilator lint_off WIDTH */
   wire [31:0] writeBackData  =
      (isSYSTEM            ? CSR_read  : 32'b0) |  // SYSTEM
      (isLUI               ? Uimm      : 32'b0) |  // LUI
      (isALU               ? aluOut    : 32'b0) |  // ALUreg, ALUimm
      (isAUIPC             ? PCplusImm : 32'b0) |  // AUIPC
      (isJALR   | isJAL    ? PCinc     : 32'b0) |  // JAL, JALR
      (isLoad | isAMO & ~isAMOsc ? LOAD_data : 32'b0) |  // Load, AMO
      (isAMO & isAMOsc ? {31'b0, ~amo_location_unchanged} : 32'b0);  // AMOsc
   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // LOAD/STORE
   /***************************************************************************/

   // All memory accesses are aligned on 32 bits boundary. For this
   // reason, we need some circuitry that does unaligned halfword
   // and byte load/store, based on:
   // - funct3[1:0]:  00->byte 01->halfword 10->word
   // - mem_addr[1:0]: indicates which byte/halfword is accessed

   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;

   // LOAD, in addition to funct3[1:0], LOAD depends on:
   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion

   wire LOAD_sign =
        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

   wire [31:0] LOAD_data =
         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                          mem_rdata ;

   wire [15:0] LOAD_halfword =
               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

   wire  [7:0] LOAD_byte =
               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

   // STORE
   // While an atomic is being written back, the bus carries amo_wdata;
   // otherwise rs2 is replicated onto the byte lanes selected by the address.

   assign mem_wdata[ 7: 0] = amo_write ? amo_wdata[ 7: 0] : rs2[7:0];
   assign mem_wdata[15: 8] = amo_write ? amo_wdata[15: 8] : loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
   assign mem_wdata[23:16] = amo_write ? amo_wdata[23:16] : loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
   assign mem_wdata[31:24] = amo_write ? amo_wdata[31:24] : loadstore_addr[0] ? rs2[7:0]  :
                                                            loadstore_addr[1] ? rs2[15:8] : rs2[31:24];

   // The memory write mask:
   //    1111                     if writing a word
   //    0011 or 1100             if writing a halfword
   //                                (depending on loadstore_addr[1])
   //    0001, 0010, 0100 or 1000 if writing a byte
   //                                (depending on loadstore_addr[1:0])

   wire [3:0] STORE_wmask =
              mem_byteAccess      ?
                    (loadstore_addr[1] ?
                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
                    ) :
              mem_halfwordAccess ?
                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
              4'b1111;

   /***************************************************************************/
   // Unaligned fetch mechanism and compressed opcode handling
   /***************************************************************************/

   // One-word fetch cache: the last instruction word read and its address.
   reg [ADDR_WIDTH-1:2] cached_addr;
   reg           [31:0] cached_data;

   wire current_cache_hit = cached_addr == PC    [ADDR_WIDTH-1:2];
   wire    next_cache_hit = cached_addr == PC_new[ADDR_WIDTH-1:2];

   // A 32-bit instruction (two LSBs == 11) that starts in the upper half of
   // a word: its second half lies in the following word.
   wire current_unaligned_long = &cached_mem [17:16] & PC    [1];
   wire    next_unaligned_long = &cached_data[17:16] & PC_new[1];

   reg fetch_second_half; // fetching the word that holds the second half
   reg long_instr;        // the latched instruction was a 32-bit one

   wire [31:0] cached_mem   = current_cache_hit ? cached_data : mem_rdata;
   // When PC points to the upper halfword, the instruction is assembled from
   // the upper half of the cached word and the lower half of mem_rdata.
   wire [31:0] decomp_input = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
                                    : cached_mem;
   wire [31:0] decompressed;

   decompressor _decomp ( .c(decomp_input), .d(decompressed) );

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   localparam FETCH_INSTR_bit     = 0;
   localparam WAIT_INSTR_bit      = 1;
   localparam EXECUTE_bit         = 2;
   localparam WAIT_ALU_OR_MEM_bit = 3;
   localparam WRITE_AMO_bit       = 4;
   localparam WAIT_AMO_bit        = 5;
   localparam NB_STATES           = 6;

   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
   localparam EXECUTE         = 1 << EXECUTE_bit;
   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
   localparam WRITE_AMO       = 1 << WRITE_AMO_bit;
   localparam WAIT_AMO        = 1 << WAIT_AMO_bit;

   reg SkipFetch; // Skip fetch state later

   (* onehot *)
   reg [NB_STATES-1:0] state;

   // The signals (internal and external) that are determined
   // combinatorially from state and other signals.

   // register write-back enable.
   wire writeBack = ~(isBranch | isStore ) & (
            state[EXECUTE_bit] |
            state[WAIT_ALU_OR_MEM_bit]
   );

   // The memory-read signal.
   assign mem_rstrb = state[EXECUTE_bit] & (isLoad | isAMO & ~isAMOsc) | state[FETCH_INSTR_bit];

   // The mask for memory-write.
   // Writes happen in EXECUTE for stores and for sc.w whose reservation is
   // still valid, and in WRITE_AMO for the read-modify-write atomics.
   assign mem_wmask = {4{state[EXECUTE_bit] & (isStore | isAMO & isAMOsc & reserved_addr & amo_location_unchanged) | state[WRITE_AMO_bit]}} & STORE_wmask;

   // aluWr starts computation (divide) in the ALU.
   assign aluWr = state[EXECUTE_bit] & isALU;

   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
   // Instructions that go through WAIT_ALU_OR_MEM before the next fetch.
   wire needToWait = isLoad | isStore | isDivide | isAMO;

   // Next value of PC (JALR target has its LSB cleared).
   wire [ADDR_WIDTH-1:0] PC_new =
           isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
           jumpToPCplusImm  ? PCplusImm :
           interrupt_return ? mepc :
                              PCinc;

   always @(posedge clk) begin
      if(!reset) begin
         state             <= WAIT_ALU_OR_MEM; //Just waiting for !mem_wbusy
         PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
         mcause            <= 0;
         cached_addr       <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
         fetch_second_half <= 0;
         SkipFetch         <= 0;
         amo_location      <= 0;
         amo_location_unchanged <= 0;
      end else begin

         // See note [1] at the end of this file.
         (* parallel_case *)
         case(1'b1)

           state[WAIT_INSTR_bit]: begin
              if(!mem_rbusy) begin // may be high when executing from SPI flash

                 // Update cache
                 if (~current_cache_hit | fetch_second_half) begin
                    cached_addr <= mem_addr[ADDR_WIDTH-1:2];
                    cached_data <= mem_rdata;
                 end

                 // Decode instruction
                 rs1        <= registerFile[decompressed[19:15]];
                 rs2        <= registerFile[decompressed[24:20]];
                 instr      <= decompressed[31:2];
                 long_instr <= &decomp_input[1:0];

                 // Long opcode, unaligned, first part fetched,
                 // happens in non-linear code
                 if (current_unaligned_long & ~fetch_second_half) begin
                    fetch_second_half <= 1;
                    state <= FETCH_INSTR;
                 end else begin
                    fetch_second_half <= 0;
                    state <= EXECUTE;
                 end
              end
           end

           state[EXECUTE_bit]: begin
              if (interrupt) begin
                 // Interrupt entry: the current instruction still completes;
                 // the address it would have continued at is saved in mepc
                 // and mcause locks out further interrupts until mret.
                 PC        <= mtvec;
                 mepc      <= PC_new;
                 mcause    <= 1;
                 state     <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
                 SkipFetch <= 0;
              end else begin
                 PC <= PC_new;
                 if (interrupt_return) mcause <= 0;

                 // The memory fetch is skipped when the next instruction is
                 // entirely contained in the cached word.
                 state <= needToWait ? WAIT_ALU_OR_MEM :
                          next_cache_hit & ~next_unaligned_long ? WAIT_INSTR :
                          FETCH_INSTR;
                 SkipFetch         <= next_cache_hit & ~next_unaligned_long;
                 fetch_second_half <= next_cache_hit &  next_unaligned_long;
              end

              // Watching a reserved memory location
              if (isAMO & isAMOlr) begin
                 amo_location <= rs1[ADDR_WIDTH-1:0];
                 amo_location_unchanged <= 1;
              end else
                if (isAMO | (isStore & reserved_addr)) begin
                   amo_location_unchanged <= 0;
                end
           end

           state[WAIT_ALU_OR_MEM_bit]: begin
              if(!aluBusy & !mem_rbusy & !mem_wbusy) begin
                 amo_wdata <= amoALU;
                 state <= isAMO & ~isAMOlr & ~isAMOsc ? WRITE_AMO  :
                          SkipFetch                   ? WAIT_INSTR :
                                                        FETCH_INSTR ;
              end
           end

           state[WRITE_AMO_bit]: begin
              state <= WAIT_AMO;
           end

           state[WAIT_AMO_bit]: begin
              if(!mem_wbusy) state <= SkipFetch ? WAIT_INSTR : FETCH_INSTR;
           end

           default: begin // FETCH_INSTR
              state <= WAIT_INSTR;
           end

         endcase
      end
   end

   // Simulation only: give the cycle counter and register 0 a known value.
`ifdef BENCH
   initial begin
      cycles = 0;
      registerFile[0] = 0;
   end
`endif

endmodule
/*****************************************************************************/
// If c[15:0] is a compressed instruction, decompresses it into d;
// otherwise copies c to d.
// RVC expander.
// c : raw instruction bits; only c[15:0] are examined unless c[1:0] == 11.
// d : c unchanged when c[1:0] == 11 (32-bit instruction), otherwise the
//     32-bit instruction equivalent to the compressed one in c[15:0].
//     Encodings that are not recognized produce an all-zero word.
module decompressor(
   input  wire [31:0] c,
   output reg  [31:0] d
);

   // Words returned for the defined-illegal encoding and for unmatched ones
   localparam ILLEGAL_INSTR = 32'h00000000;
   localparam UNKNOWN_INSTR = 32'h00000000;

   // Major opcodes of the generated 32-bit instructions
   localparam [6:0] OP_LOAD   = 7'b0000011;
   localparam [6:0] OP_IMM    = 7'b0010011;
   localparam [6:0] OP_STORE  = 7'b0100011;
   localparam [6:0] OP_REG    = 7'b0110011;
   localparam [6:0] OP_LUI    = 7'b0110111;
   localparam [6:0] OP_BRANCH = 7'b1100011;
   localparam [6:0] OP_JALR   = 7'b1100111;
   localparam [6:0] OP_JAL    = 7'b1101111;

   // funct7 field values
   localparam [6:0] F7_BASE = 7'b0000000;
   localparam [6:0] F7_ALT  = 7'b0100000;  // selects SUB / SRA

   // Fixed registers implied by some compressed forms
   localparam ZERO = 5'b00000;  // x0
   localparam RA   = 5'b00001;  // x1
   localparam SP   = 5'b00010;  // x2

   // Register fields. The 3-bit fields address x8..x15.
   wire [4:0] creg_lo = {2'b01, c[4:2]};
   wire [4:0] creg_hi = {2'b01, c[9:7]};
   wire [4:0] reg_lo  = c[ 6:2];
   wire [4:0] reg_hi  = c[11:7];

   // Immediates, put back in natural bit order
   wire  [4:0] shamt        = c[6:2];
   wire [11:0] imm_addi4spn = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
   wire [11:0] imm_lwsw     = {5'b00000, c[5], c[12:10], c[6], 2'b00};
   wire [11:0] imm_lwsp     = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
   wire [11:0] imm_swsp     = {4'b0000, c[8:7], c[12:9], 2'b00};
   wire [11:0] imm_addi16sp = {{3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
   wire [11:0] imm_ci       = {{7{c[12]}}, c[6:2]};

   /* verilator lint_off UNUSED */
   wire [12:0] imm_cb  = {{5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
   wire [20:0] imm_cj  = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
   wire [31:0] imm_lui = {{15{c[12]}}, c[6:2], 12'b000000000000};
   /* verilator lint_on UNUSED */

   // The same immediates, scrambled the way the J and B formats store them
   wire [19:0] jal_field = {imm_cj[20], imm_cj[10:1], imm_cj[11], imm_cj[19:12]};
   wire  [6:0] br_hi     = {imm_cb[12], imm_cb[10:5]};
   wire  [4:0] br_lo     = {imm_cb[4:1], imm_cb[11]};

   // Patterns are written funct3 _ c[12] _ c[11:7] _ c[6:2] _ quadrant.
   // The first matching entry wins, so specific patterns precede general ones.
   always @* begin
      casez (c[15:0])

         // ---- 32-bit instruction: forwarded untouched
         16'b???_?_?????_?????_11 : d = c;

         // ---- quadrant 0
         /* verilator lint_off CASEOVERLAP */
         16'b000_0_00000_00000_00 : d = ILLEGAL_INSTR;                                             // c.illegal
         16'b000_?_?????_?????_00 : d = {imm_addi4spn, SP, 3'b000, creg_lo, OP_IMM};                 // c.addi4spn: addi rd', x2, imm
         /* verilator lint_on CASEOVERLAP */
         16'b010_?_?????_?????_00 : d = {imm_lwsw, creg_hi, 3'b010, creg_lo, OP_LOAD};               // c.lw: lw rd', imm(rs1')
         16'b110_?_?????_?????_00 : d = {imm_lwsw[11:5], creg_lo, creg_hi, 3'b010,
                                         imm_lwsw[4:0], OP_STORE};                                 // c.sw: sw rs2', imm(rs1')

         // ---- quadrant 1
         16'b000_?_?????_?????_01 : d = {imm_ci, reg_hi, 3'b000, reg_hi, OP_IMM};                    // c.addi: addi rd, rd, imm
         16'b001_?_?????_?????_01 : d = {jal_field, RA, OP_JAL};                                     // c.jal: jal x1, imm
         16'b010_?_?????_?????_01 : d = {imm_ci, ZERO, 3'b000, reg_hi, OP_IMM};                      // c.li: addi rd, x0, imm
         16'b011_?_00010_?????_01 : d = {imm_addi16sp, reg_hi, 3'b000, reg_hi, OP_IMM};              // c.addi16sp: addi x2, x2, imm
         16'b011_?_?????_?????_01 : d = {imm_lui[31:12], reg_hi, OP_LUI};                            // c.lui: lui rd, imm
         16'b100_?_00???_?????_01 : d = {F7_BASE, shamt, creg_hi, 3'b101, creg_hi, OP_IMM};          // c.srli: srli rd', rd', shamt
         16'b100_?_01???_?????_01 : d = {F7_ALT,  shamt, creg_hi, 3'b101, creg_hi, OP_IMM};          // c.srai: srai rd', rd', shamt
         16'b100_?_10???_?????_01 : d = {imm_ci, creg_hi, 3'b111, creg_hi, OP_IMM};                  // c.andi: andi rd', rd', imm
         16'b100_0_11???_00???_01 : d = {F7_ALT,  creg_lo, creg_hi, 3'b000, creg_hi, OP_REG};        // c.sub: sub rd', rd', rs2'
         16'b100_0_11???_01???_01 : d = {F7_BASE, creg_lo, creg_hi, 3'b100, creg_hi, OP_REG};        // c.xor: xor rd', rd', rs2'
         16'b100_0_11???_10???_01 : d = {F7_BASE, creg_lo, creg_hi, 3'b110, creg_hi, OP_REG};        // c.or:  or  rd', rd', rs2'
         16'b100_0_11???_11???_01 : d = {F7_BASE, creg_lo, creg_hi, 3'b111, creg_hi, OP_REG};        // c.and: and rd', rd', rs2'
         16'b101_?_?????_?????_01 : d = {jal_field, ZERO, OP_JAL};                                   // c.j: jal x0, imm
         16'b110_?_?????_?????_01 : d = {br_hi, ZERO, creg_hi, 3'b000, br_lo, OP_BRANCH};            // c.beqz: beq rs1', x0, imm
         16'b111_?_?????_?????_01 : d = {br_hi, ZERO, creg_hi, 3'b001, br_lo, OP_BRANCH};            // c.bnez: bne rs1', x0, imm

         // ---- quadrant 2
         16'b000_?_?????_?????_10 : d = {F7_BASE, shamt, reg_hi, 3'b001, reg_hi, OP_IMM};            // c.slli: slli rd, rd, shamt
         16'b010_?_?????_?????_10 : d = {imm_lwsp, SP, 3'b010, reg_hi, OP_LOAD};                     // c.lwsp: lw rd, imm(x2)
         16'b100_0_?????_00000_10 : d = {12'b000000000000, reg_hi, 3'b000, ZERO, OP_JALR};           // c.jr: jalr x0, rs1, 0
         16'b100_0_?????_?????_10 : d = {F7_BASE, reg_lo, ZERO, 3'b000, reg_hi, OP_REG};             // c.mv: add rd, x0, rs2
         // c.ebreak (16'b100_1_00000_00000_10) has no entry of its own; it
         // is caught by the c.jalr pattern that follows.
         16'b100_1_?????_00000_10 : d = {12'b000000000000, reg_hi, 3'b000, RA, OP_JALR};             // c.jalr: jalr x1, rs1, 0
         16'b100_1_?????_?????_10 : d = {F7_BASE, reg_lo, reg_hi, 3'b000, reg_hi, OP_REG};           // c.add: add rd, rd, rs2
         16'b110_?_?????_?????_10 : d = {imm_swsp[11:5], reg_lo, SP, 3'b010,
                                         imm_swsp[4:0], OP_STORE};                                 // c.swsp: sw rs2, imm(x2)

         // ---- anything else
         default                  : d = UNKNOWN_INSTR;
      endcase
   end

endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements,
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

View File

@@ -0,0 +1,523 @@
/*******************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
//
// This version: The "Intermissum", with full interrupt support.
// A single VERILOG file, compact & understandable code.
//
// Instruction set: RV32IM + CSR + MRET
//
// Parameters:
// Reset address can be defined using RESET_ADDR (default is 0).
//
// The ADDR_WIDTH parameter lets you define the width of the internal
// address bus (and address computation logic).
//
// Bruno Levy, Matthias Koch, 2020-2021
/*******************************************************************/
// Firmware generation flags for this processor
// NOTE(review): none of these macros is referenced by the RTL visible in this
// file; presumably they are read by the firmware/toolchain build scripts to
// pick compiler options -- confirm against the build system.
// ISA string; agrees with the "RV32IM" instruction set named in the header.
`define NRV_ARCH "rv32im"
// ABI string.
`define NRV_ABI "ilp32"
// Optimization flag string.
`define NRV_OPTIMIZE "-O3"
// Value-less marker macro; presumably advertises that this core has
// interrupt support (see interrupt_request below) -- confirm.
`define NRV_INTERRUPTS
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input interrupt_request,
input reset // set to 0 to reset the processor
);
parameter RESET_ADDR = 32'h00000000;
parameter ADDR_WIDTH = 24;
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// The destination register
wire [4:0] rdId = instr[11:7];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *)
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
/* verilator lint_on UNUSED */
// Base RISC-V (RV32I) has only 10 different instructions !
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] registerFile [31:0];
always @(posedge clk) begin
if (writeBack)
if (rdId != 0)
registerFile[rdId] <= writeBackData;
end
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except divisions.
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
wire aluWr; // ALU write strobe, starts dividing.
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire LTU = aluMinus[32];
wire EQ = (aluMinus[31:0] == 0);
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
wire [31:0] shifter_in = funct3Is[1] ?
{aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
aluIn1[30], aluIn1[31]} : aluIn1;
/* verilator lint_off WIDTH */
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] leftshift = {
shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
shifter[30], shifter[31]};
/***************************************************************************/
// RV32M decoding and the (combinatorial) multiplier.
wire funcM = instr[25];                       // selects the MUL/DIV result when isALUreg
wire isDivide = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
wire aluBusy = |quotient_msk; // ALU is busy if division is in progress.
// funct3: 1->MULH, 2->MULHSU 3->MULHU
wire isMULH = funct3Is[1];
wire isMULHSU = funct3Is[2];
// Operand signedness, as defined by the RISC-V "M" extension:
//   MULH   : signed   rs1 x signed   rs2
//   MULHSU : signed   rs1 x unsigned rs2
//   MULHU  : unsigned rs1 x unsigned rs2
// Hence rs1 (aluIn1) is sign-extended for MULH and MULHSU, while rs2
// (aluIn2) is sign-extended for MULH only. (The two were swapped before,
// which made MULHSU compute unsigned rs1 x signed rs2.)
// For MUL only the low 32 bits are used, and they do not depend on the
// signedness of the operands.
wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
wire sign2 = aluIn2[31] & isMULH;
// 33-bit signed operands: bit 32 carries the (optional) sign extension.
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
/***************************************************************************/
// ALU result selection (AND-OR form, one term per one-hot funct3 value).
// Reminders on the encoding:
// - instr[30] distinguishes SUB (1) from ADD (0); instr[5] must be tested
//   as well because for ADDI bit 30 belongs to the immediate.
// - instr[30] distinguishes SRA (1) from SRL (0); this is handled in the
//   shifter itself.
wire [31:0] aluOut_base =
   ({32{funct3Is[0]}} & ((instr[30] & instr[5]) ? aluMinus[31:0] : aluPlus)) |
   ({32{funct3Is[1]}} & leftshift                                        ) |
   ({32{funct3Is[2]}} & {31'b0, LT}                                      ) |
   ({32{funct3Is[3]}} & {31'b0, LTU}                                     ) |
   ({32{funct3Is[4]}} & (aluIn1 ^ aluIn2)                                ) |
   ({32{funct3Is[5]}} & shifter                                          ) |
   ({32{funct3Is[6]}} & (aluIn1 | aluIn2)                                ) |
   ({32{funct3Is[7]}} & (aluIn1 & aluIn2)                                ) ;

// RV32M result: funct3 0 -> MUL (low word), 1..3 -> MULH/MULHSU/MULHU
// (high word), 4..7 -> DIV/DIVU/REM/REMU (sign-corrected divider result).
wire [31:0] aluOut_muldiv =
   ({32{ funct3Is[0]   }} & multiply[31: 0]                        ) |
   ({32{|funct3Is[3:1] }} & multiply[63:32]                        ) |
   ({32{ instr[14]     }} & (div_sign ? -divResult : divResult)    ) ;

// Final ALU output: RV32M result for register-register instructions with
// funcM set, base integer result otherwise.
wire [31:0] aluOut = (isALUreg & funcM) ? aluOut_muldiv : aluOut_base;
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// Iterative restoring-style divider working on absolute values:
// - dividend     : running remainder
// - divisor      : |rs2| placed in the upper bits, shifted right each cycle
// - quotient     : quotient bits accumulated so far
// - quotient_msk : one-hot mask of the quotient bit produced this cycle;
//                  the division is in progress while it is non-zero
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [31:0] quotient_msk;
// One division step: subtract if the shifted divisor fits in the remainder.
wire divstep_do = divisor <= {31'b0, dividend};
wire [31:0] dividendN = divstep_do ? dividend - divisor[31:0] : dividend;
wire [31:0] quotientN = divstep_do ? quotient | quotient_msk : quotient;
// Sign of the result, for the signed variants only (instr[12] == 0):
// - remainder (instr[13] == 1): sign of the dividend
// - quotient  (instr[13] == 0): negative if the operand signs differ and
//   the divisor is non-zero
wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] != aluIn2[31]) & |aluIn2);
always @(posedge clk) begin
if (isDivide & aluWr) begin
// Start: load absolute values (negated only for the signed variants).
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
quotient_msk <= 1 << 31;
end else begin
// Step: runs on every other cycle, whether or not a division is active.
dividend <= dividendN;
divisor <= divisor >> 1;
quotient <= quotientN;
quotient_msk <= quotient_msk >> 1;
end
end
// Registered (unsigned) result: remainder if instr[13], quotient otherwise.
// The sign correction is applied where aluOut_muldiv is formed.
reg [31:0] divResult;
always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
/***************************************************************************/
// Branch condition.
/***************************************************************************/
// One comparison result per funct3 value, masked by the one-hot funct3
// decode and OR-reduced. Bit positions (MSB first):
//   7:BGEU  6:BLTU  5:BGE  4:BLT  3:-  2:-  1:BNE  0:BEQ
wire predicate =
   |(funct3Is & {!LTU, LTU, !LT, LT, 1'b0, 1'b0, !EQ, EQ});
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
// Only two opcode bits are tested (instr[3], then instr[4]) instead of the
// fully decoded signals; the result is only meaningful for those three
// instruction kinds.
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// A separate adder to compute the destination of load/store.
// testing instr[5] is equivalent to testing isStore in this context.
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
/* verilator lint_off WIDTH */
// Address bus: PC while fetching or waiting for an instruction, load/store
// address otherwise. Both sources are ADDR_WIDTH bits wide, hence the
// WIDTH lint waiver around this assignment.
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
PC : loadstore_addr ;
/* verilator lint_on WIDTH */
/***************************************************************************/
// Interrupt logic, CSR registers and opcodes.
/***************************************************************************/
// Interrupt logic:
// Remember interrupt requests as they are not checked for every cycle
reg interrupt_request_sticky;
// Interrupt enable and lock logic
// An interrupt is pending when a request has been latched, interrupts are
// enabled (mstatus) and no handler is currently running (mcause clear).
wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
// Processor accepts interrupts in EXECUTE state.
wire interrupt_accepted = interrupt & state[EXECUTE_bit];
// If current interrupt is accepted, there already might be the next one,
// which should not be missed:
// (a new request has priority over clearing the latch; note that this
// register is not affected by reset)
always @(posedge clk) begin
interrupt_request_sticky <=
interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
end
// Decoder for mret opcode
// Only opcode and funct3 == 0 are tested: the check of the immediate field
// is commented out, so every SYSTEM instruction with funct3 == 0 matches.
wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
// Machine-mode CSR state.
reg [ADDR_WIDTH-1:0] mepc;    // return address saved on interrupt entry
reg [ADDR_WIDTH-1:0] mtvec;   // interrupt handler address
reg                  mstatus; // interrupt enable (read back as bit 3)
reg                  mcause;  // interrupt in progress (read back as bit 31)
reg [63:0]           cycles;  // free-running cycle counter

always @(posedge clk) cycles <= cycles + 1;

// CSR address decode (instr[31:20] holds the CSR number).
wire sel_mstatus = (instr[31:20] == 12'h300);
wire sel_mtvec   = (instr[31:20] == 12'h305);
wire sel_mepc    = (instr[31:20] == 12'h341);
wire sel_mcause  = (instr[31:20] == 12'h342);
wire sel_cycles  = (instr[31:20] == 12'hC00);
wire sel_cyclesh = (instr[31:20] == 12'hC80);

// CSR read value: AND-OR mux over the select signals; 0 if none matches.
/* verilator lint_off WIDTH */
wire [31:0] CSR_read =
   ({32{sel_mstatus}} & {28'b0, mstatus, 3'b0}) |
   ({32{sel_mtvec  }} & mtvec                 ) |
   ({32{sel_mepc   }} & mepc                  ) |
   ({32{sel_mcause }} & {mcause, 31'b0}       ) |
   ({32{sel_cycles }} & cycles[31:0]          ) |
   ({32{sel_cyclesh}} & cycles[63:32]         ) ;
/* verilator lint_on WIDTH */

// CSR write operand: 5-bit zero-extended immediate when instr[14] is set,
// content of rs1 otherwise.
wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;

// Value to be written, selected by instr[13:12]:
//   2'b10 -> set bits, 2'b11 -> clear bits, anything else -> plain write.
wire [31:0] CSR_write =
   instr[13] ? (instr[12] ? (~CSR_modifier & CSR_read)
                          : ( CSR_modifier | CSR_read))
             : CSR_modifier;
// CSR write port. reset is active low and only clears mstatus (interrupts
// disabled); mtvec keeps its value across reset.
always @(posedge clk) begin
if(!reset) begin
mstatus <= 0;
end else begin
// Execute a CSR opcode
// (a SYSTEM instruction with funct3 != 0, committed in the EXECUTE state).
// Only mstatus and mtvec are written here; mepc and mcause are updated
// by the state machine.
if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
if (sel_mstatus) mstatus <= CSR_write[3];
if (sel_mtvec ) mtvec <= CSR_write[ADDR_WIDTH-1:0];
end
end
end
/***************************************************************************/
// Register write-back value.
/***************************************************************************/
// AND-OR mux: each source is masked by its decode signal, then all terms
// are OR-ed together. The PC-derived sources are ADDR_WIDTH bits wide and
// get zero-extended, hence the WIDTH lint waiver.
/* verilator lint_off WIDTH */
wire [31:0] writeBackData =
   ({32{isSYSTEM      }} & CSR_read ) | // SYSTEM
   ({32{isLUI         }} & Uimm     ) | // LUI
   ({32{isALU         }} & aluOut   ) | // ALUreg, ALUimm
   ({32{isAUIPC       }} & PCplusImm) | // AUIPC
   ({32{isJALR | isJAL}} & PCplus4  ) | // JAL, JALR
   ({32{isLoad        }} & LOAD_data) ; // Load
/* verilator lint_on WIDTH */
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - mem_addr[1:0]: indicates which byte/halfword is accessed
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
// LOAD_sign is the bit replicated into the upper part of the loaded value:
// the MSB of the selected byte/halfword, or 0 when instr[14] is set.
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
// Loaded value: extended byte, extended halfword, or the full word.
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
// Address bit 1 selects the upper or lower halfword of the word read.
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
// Address bit 0 selects the upper or lower byte of that halfword.
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
// rs2 is replicated onto the byte lanes so that, whatever the two low
// address bits are, the lanes enabled by the write mask carry the right
// data. Lanes listed from byte 3 (MSB) down to byte 0.
assign mem_wdata = {
   (loadstore_addr[0] ? rs2[7:0] :
    loadstore_addr[1] ? rs2[15:8] : rs2[31:24]),      // [31:24]
   (loadstore_addr[1] ? rs2[7:0]  : rs2[23:16]),      // [23:16]
   (loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8]),      // [15: 8]
   rs2[7:0]                                           // [ 7: 0]
};

// Byte-lane write mask:
//   word     : 1111
//   halfword : 0011 or 1100, selected by loadstore_addr[1]
//   byte     : a single 1 at the position given by loadstore_addr[1:0]
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])          :
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011)   :
                        4'b1111;
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
// Bit index of each state inside the one-hot 'state' register.
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam EXECUTE_bit = 2;
localparam WAIT_ALU_OR_MEM_bit = 3;
localparam NB_STATES = 4;
// One-hot state values derived from the bit indices above.
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE = 1 << EXECUTE_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable.
// Asserted for every instruction except branches and stores, in EXECUTE
// and for as long as the core stays in WAIT_ALU_OR_MEM.
wire writeBack = ~(isBranch | isStore ) &
(state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
// The memory-read signal.
// Raised for instruction fetch and when a load is executed.
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write.
// Non-zero only while a store is in the EXECUTE state.
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (division) in the ALU.
assign aluWr = state[EXECUTE_bit] & isALU;
// Taken branch or JAL: next PC comes from the PC+imm adder.
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
// Instructions that need the WAIT_ALU_OR_MEM state after EXECUTE.
wire needToWait = isLoad | isStore | isDivide;
// Next PC, in priority order: JALR target (bit 0 forced to 0), PC+imm,
// saved mepc on interrupt return, next sequential instruction.
wire [ADDR_WIDTH-1:0] PC_new =
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
interrupt_return ? mepc :
PCplus4;
// Main sequencer. States: FETCH_INSTR -> WAIT_INSTR -> EXECUTE ->
// (WAIT_ALU_OR_MEM ->) FETCH_INSTR. reset is active low.
always @(posedge clk) begin
if(!reset) begin
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
mcause <= 0;
end else
// See note [1] at the end of this file.
(* parallel_case *)
case(1'b1)
state[WAIT_INSTR_bit]: begin
// Instruction word available: read both source registers straight from
// the fetched word and latch the instruction.
if(!mem_rbusy) begin // may be high when executing from SPI flash
rs1 <= registerFile[mem_rdata[19:15]];
rs2 <= registerFile[mem_rdata[24:20]];
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE; // also the declaration of instr).
end
end
state[EXECUTE_bit]: begin
if (interrupt) begin
// Interrupt entry: jump to the handler, save the address execution would
// have continued at (PC_new) and set the lock.
PC <= mtvec;
mepc <= PC_new;
mcause <= 1;
end else begin
PC <= PC_new;
// Interrupt return releases the lock.
if (interrupt_return) mcause <= 0;
end
state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
end
state[WAIT_ALU_OR_MEM_bit]: begin
// Stay here until the divider and the memory interface are all idle.
if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
// Simulation only (BENCH defined): give the cycle counter and register 0
// of the register file a defined initial value.
`ifdef BENCH
initial begin
cycles = 0;
registerFile[0] = 0;
end
`endif
endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements.
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

View File

@@ -0,0 +1,790 @@
/******************************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
//
// This version: PetitBateau (make it float), RV32IMFC
// Rounding works as follows:
// - all subnormals are flushed to zero
// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
// - FDIV and FSQRT do not have correct rounding
//
// [TODO] add FPU CSR (and instret for perf stat)
// [TODO] FSW/FLW unaligned (does not seem to occur, but the norm requires it)
// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
// [TODO] support IEEE754 denormals
// [TODO] NaNs propagation and infinity
// [TODO] support all IEEE754 rounding modes
//
// Bruno Levy, Matthias Koch, 2020-2021
/******************************************************************************/
`include "petitbateau.v"
// Firmware generation flags for this processor
// Note: atomic instructions not supported, but 'a' is set in
// compiler flag, because there is no toolchain/libs for
// rv32imfc / imf in most risc-V compiler distributions.
`define NRV_ARCH "rv32imafc"
`define NRV_ABI "ilp32f"
`define NRV_OPTIMIZE "-O3"
`define NRV_INTERRUPTS
// Check condition and display message in simulation
// - msg is pasted directly after $display, so it must be a parenthesized
//   argument list, e.g. `ASSERT(cond, ("value=%x", v)).
// - ASSERT expands to a bare 'if' statement without a trailing ';'.
// - Without BENCH both macros expand to nothing.
`ifdef BENCH
`define ASSERT(cond,msg) if(!(cond)) $display msg
`define ASSERT_NOT_REACHED(msg) $display msg
`else
`define ASSERT(cond,msg)
`define ASSERT_NOT_REACHED(msg)
`endif
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input interrupt_request,
input reset // set to 0 to reset the processor
);
// Bit-reverse a 32-bit word: bit k of the result is bit 31-k of x.
// Used around the shifter so that a single right shifter also performs
// left shifts (saves logic).
function [31:0] flip32;
   input [31:0] x;
   integer k;
   begin
      for (k = 0; k < 32; k = k + 1) begin
         flip32[k] = x[31-k];
      end
   end
endfunction
// Value loaded into PC on reset (only its ADDR_WIDTH low bits are used).
parameter RESET_ADDR = 32'h00000000;
// Width in bits of PC, mepc, mtvec and of the address adders.
parameter ADDR_WIDTH = 24;
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// Note: instr is declared [31:2] (the two low bits are not stored), so
// opcodes are decoded from instr[6:2] or a subset of those bits.
wire [2:0] funct3 = instr[14:12];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *) wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
// Uimm has its 12 low bits at zero; the other four are sign-extended from
// instr[31].
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
/* verilator lint_on UNUSED */
// Base RISC-V (RV32I) has only 10 different instructions !
// isLoad and isStore ignore instr[2], so they also match FLW and FSW.
wire isLoad = (instr[6:3] == 4'b0000 ); // rd <-mem[rs1+Iimm] (bit 2:FLW)
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore = (instr[6:3] == 4'b0100 ); // mem[rs1+Simm]<-rs2 (bit 2:FSW)
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
wire isFPU = (instr[6:5] == 2'b10); // all FPU instr except FLW/FSW
// Any integer ALU instruction, immediate or register-register form.
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
// rs1, rs2 and rs3 hold the source operands read from the register file
// for the instruction being executed.
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] rs3; // this one is used by the FMA instructions.
// Single 64-entry bank: bit 5 of the index selects the floating-point half.
reg [31:0] registerFile [63:0]; // 0..31: integer registers
// 32..63: floating-point registers
/***************************************************************************/
// The ALU: everything below is combinatorial, except divisions.
/***************************************************************************/
// Operand 1 is always rs1.
wire [31:0] aluIn1 = rs1;

// Operand 2 is rs2 for register-register ALU operations and branches,
// and the I-type immediate otherwise (ALUimm, Load, JALR).
wire [31:0] aluIn2 = (isALUreg | isBranch) ? rs2 : Iimm;

// ALU write strobe (starts a division), driven further down.
wire aluWr;

// Adder, shared by the arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;

// A single 33-bit subtraction (aluIn1 + ~aluIn2 + 1) provides SUB and all
// the comparisons (trick borrowed from swapforth/J1). Bit 32 is set when
// aluIn1 < aluIn2, unsigned.
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;

// Unsigned less-than.
wire LTU = aluMinus[32];

// Signed less-than: same signs -> unsigned result, different signs -> the
// negative operand (aluIn1[31] set) is the smaller one.
wire LT = (aluIn1[31] == aluIn2[31]) ? aluMinus[32] : aluIn1[31];

// Equality: no bit set in the 32-bit difference.
wire EQ = ~|aluMinus[31:0];
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
// For funct3 == 1 the operand is bit-reversed before the right shifter and
// the result is bit-reversed again ('leftshift').
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
/* verilator lint_off WIDTH */
// 33-bit arithmetic right shift: the extra top bit is aluIn1[31] when
// instr[30] is set (sign-extending shift), 0 otherwise. The result is
// truncated to 32 bits, hence the WIDTH lint waiver.
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] leftshift = flip32(shifter);
/***************************************************************************/
// RV32M decoding and the multiplier.
wire funcM = instr[25];                       // selects the MUL/DIV result when isALUreg
wire isDivide = isALUreg & funcM & instr[14];
wire aluBusy = |div_cnt; // ALU is busy if division is in progress.
// funct3: 1->MULH, 2->MULHSU 3->MULHU
wire isMULH = funct3Is[1];
wire isMULHSU = funct3Is[2];
// Operand signedness, as defined by the RISC-V "M" extension:
//   MULH   : signed   rs1 x signed   rs2
//   MULHSU : signed   rs1 x unsigned rs2
//   MULHU  : unsigned rs1 x unsigned rs2
// Hence rs1 (aluIn1) is sign-extended for MULH and MULHSU, while rs2
// (aluIn2) is sign-extended for MULH only. (The two were swapped before,
// which made MULHSU compute unsigned rs1 x signed rs2.)
// For MUL only the low 32 bits are used, and they do not depend on the
// signedness of the operands.
wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
wire sign2 = aluIn2[31] & isMULH;
// 33-bit signed operands: bit 32 carries the (optional) sign extension.
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
/***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
// Base integer ALU result: exactly one bit of the one-hot funct3Is is set,
// so exactly one of the OR-ed terms below is non-zero-masked.
wire [31:0] aluOut_base =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
(funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
// Registered multiplier result: low word for funct3 == 0 (MUL), high word
// for every other funct3 value.
reg [31:0] aluOut_mul;
always @(posedge clk) begin
   if (funct3Is[0]) aluOut_mul <= multiply[31:0];
   else             aluOut_mul <= multiply[63:32];
end

// Registered divider result: remainder (dividend register) when instr[13]
// is set, quotient otherwise; negated when div_sign is set.
reg [31:0] aluOut_div;
always @(posedge clk) begin
   if (div_sign) aluOut_div <= instr[13] ? -dividend : -quotient;
   else          aluOut_div <= instr[13] ?  dividend :  quotient;
end

// Final ALU output: for register-register instructions with funcM set,
// instr[14] picks the divider (1) or the multiplier (0) result; every
// other instruction gets the base integer result.
reg [31:0] aluOut;
always @(*) begin
   if (isALUreg & funcM) begin
      aluOut = instr[14] ? aluOut_div : aluOut_mul;
   end else begin
      aluOut = aluOut_base;
   end
end
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// Iterative divider working on absolute values:
// - dividend : running remainder
// - divisor  : |rs2| placed in the upper bits, shifted right at each step
// - quotient : quotient bits, shifted in LSB first
// - div_cnt  : remaining cycles; the ALU is busy while it is non-zero
// - div_sign : set if the result has to be negated
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [5:0] div_cnt;
reg div_sign;
always @(posedge clk) begin
if (aluWr) begin
// The operand registers are loaded on every ALU instruction; only a
// divide starts the counter.
// Result sign, signed variants only (instr[12] == 0): sign of the dividend
// for a remainder (instr[13]), otherwise set if the operand signs differ
// and the divisor is non-zero.
div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] != aluIn2[31]) & |aluIn2);
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
div_cnt <= isDivide ? 33 : 0; // one additional cycle for aluOut_div
end else begin
if(aluBusy) div_cnt <= div_cnt - 1;
end
// Division step, performed while div_cnt >= 2 (32 steps for a start value
// of 33); the cycle with div_cnt == 1 only lets aluOut_div be registered.
if(|div_cnt[5:1]) begin
divisor <= divisor >> 1;
if(divisor <= {31'b0, dividend}) begin
quotient <= {quotient[30:0],1'b1};
dividend <= dividend - divisor[31:0];
end else begin
quotient <= {quotient[30:0],1'b0};
end
end
end
/***************************************************************************/
// Branch condition: one comparison result per funct3 value, masked by the
// one-hot funct3 decode and OR-reduced. Bit positions (MSB first):
//   7:BGEU  6:BLTU  5:BGE  4:BLT  3:-  2:-  1:BNE  0:BEQ
wire predicate = |(funct3Is & {!LTU, LTU, !LT, LT, 1'b0, 1'b0, !EQ, EQ});
/***************************************************************************/
// Registers read-write
/***************************************************************************/
// Single always block for the register file: source operands are read in
// the WAIT_INSTR / DECOMPRESS_GETREGS states, the destination is written
// otherwise. Bit 5 of each index selects the floating-point half.
always @(posedge clk) begin
if(state[WAIT_INSTR_bit]) begin
// Fetch registers as soon as instruction is ready.
// Indices come from the raw (not yet latched) instruction word.
rs1 <= registerFile[{raw_rs1IsFP,raw_instr[19:15]}];
rs2 <= registerFile[{raw_rs2IsFP,raw_instr[24:20]}];
rs3 <= registerFile[{1'b1, raw_instr[31:27]}];
end else if(state[DECOMPRESS_GETREGS_bit]) begin
// For compressed instructions, fetch registers once decompressed.
// Indices come from the latched instruction.
rs1 <= registerFile[{decomp_rs1IsFP,instr[19:15]}];
rs2 <= registerFile[{decomp_rs2IsFP,instr[24:20]}];
// no need to fetch rs3 here, there is no compressed FMA.
end else if(writeBack & !fpuBusy) begin
// Integer register 0 is never written; floating-point register 0 is.
if(rdIsFP || |instr[11:7]) begin
registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
end
end
end
/***************************************************************************/
// The FPU
/***************************************************************************/
// fpuBusy gates register write-back; fpuOut is the write-back value for
// FPU instructions.
wire fpuBusy;
wire [31:0] fpuOut;
// PetitBateau FPU instance (module comes from the included petitbateau.v).
// 'wr' is pulsed when an FPU instruction is in the EXECUTE state.
PetitBateau FPU(
.clk(clk),
.wr(state[EXECUTE_bit] & isFPU),
.instr(instr[31:2]),
.rs1(rs1),
.rs2(rs2),
.rs3(rs3),
.busy(fpuBusy),
.out(fpuOut)
);
// There is a single register bank, registers 0..31 are the integer
// registers, and 32..63 are the floating point registers, hence
// bit 5 of rs1,rs2,rd index is set to 0 for an integer register
// and 1 for a fp register.
// asserted if the destination register is a floating-point register
wire rdIsFP = (instr[6:2] == 5'b00001) || // FLW
(instr[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB
(instr[6:4] == 3'b101 && (
(instr[31] == 1'b0) || // R-Type FPU
(instr[31:28] == 4'b1101) || // FCVT.S.W{U}
(instr[31:28] == 4'b1111) // FMV.W.X
)
);
// rs1 is a FP register if instr[6:5] = 2'b10 except for:
// FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[31:28] = 4'b1101
// FMV.W.X : instr[6:2] = 5'b10100 and instr[31:28] = 4'b1111
// (two versions of the signal, one for regular instruction decode,
// the other one for compressed instructions).
// raw_* versions decode the instruction word as fetched (raw_instr),
// decomp_* versions decode the latched instruction (instr).
wire raw_rs1IsFP = (raw_instr[6:5] == 2'b10 ) &&
!((raw_instr[4:2] == 3'b100) && (
(raw_instr[31:28] == 4'b1101) || // FCVT.S.W{U}
(raw_instr[31:28] == 4'b1111) // FMV.W.X
)
);
wire decomp_rs1IsFP = (instr[6:5] == 2'b10 ) &&
!((instr[4:2] == 3'b100) && (
(instr[31:28] == 4'b1101) || // FCVT.S.W{U}
(instr[31:28] == 4'b1111) // FMV.W.X
)
);
// rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
// (two versions of the signal, one for regular instruction decode,
// the other one for compressed instructions).
wire raw_rs2IsFP = (raw_instr[6:5] == 2'b10) || (raw_instr[6:2]==5'b01001);
wire decomp_rs2IsFP = (instr[6:5] == 2'b10) || (instr[6:2]==5'b01001);
/***************************************************************************/
// Program counter, branch targets and the address bus.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC;    // Program counter.
reg [31:2]           instr; // Latched instruction; bits 1:0 are not stored
                            // (not used in RV32I base instr set).

// Address of the next sequential instruction: +4 after a 32-bit
// instruction, +2 after a compressed one.
wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
wire [ADDR_WIDTH-1:0] PCinc   = long_instr ? PCplus4 : PCplus2;

// PC-relative adder shared by branches (PC+Bimm), AUIPC (PC+Uimm) and
// JAL (PC+Jimm). Same as PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm),
// but decoded from two opcode bits only.
wire [ADDR_WIDTH-1:0] PCplusImm =
   PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0]
                   : (~instr[4] ? Bimm[ADDR_WIDTH-1:0]
                                : Uimm[ADDR_WIDTH-1:0]) );

// Dedicated adder for load/store addresses. In this context instr[5]
// tells a store (S-type immediate) from a load (I-type immediate).
wire [ADDR_WIDTH-1:0] loadstore_addr =
   rs1[ADDR_WIDTH-1:0] +
   (~instr[5] ? Iimm[ADDR_WIDTH-1:0] : Simm[ADDR_WIDTH-1:0]);

// Address bus. While fetching, the word-aligned address of PC is used, or
// that of PC+4 when the second half of an instruction is being fetched;
// otherwise the load/store address is driven.
/* verilator lint_off WIDTH */
assign mem_addr =
   (state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit])
      ? {(fetch_second_half ? PCplus4[ADDR_WIDTH-1:2]
                            : PC     [ADDR_WIDTH-1:2]), 2'b00}
      : loadstore_addr;
/* verilator lint_on WIDTH */
/***************************************************************************/
// Interrupt logic, CSR registers and opcodes.
/***************************************************************************/
// Requests are latched because they are only looked at in EXECUTE state.
reg interrupt_request_sticky;

// An interrupt is taken when a request is latched, interrupts are enabled
// (mstatus) and no handler is already running (mcause clear).
wire interrupt = mstatus & ~mcause & interrupt_request_sticky;

// The processor accepts interrupts in the EXECUTE state only.
wire interrupt_accepted = state[EXECUTE_bit] & interrupt;

// Latch update: a new request always sets the latch (so a request arriving
// while the previous one is accepted is not lost); otherwise accepting
// the interrupt clears it.
always @(posedge clk) begin
   if (interrupt_request) begin
      interrupt_request_sticky <= 1'b1;
   end else if (interrupt_accepted) begin
      interrupt_request_sticky <= 1'b0;
   end
end

// mret decoder: opcode and funct3 only (immediate check left out).
wire interrupt_return = funct3Is[0] & isSYSTEM; // & (instr[31:20]==12'h302);
// CSRs:
reg [ADDR_WIDTH-1:0] mepc; // The saved program counter.
reg [ADDR_WIDTH-1:0] mtvec; // The address of the interrupt handler.
reg mstatus; // Interrupt enable
reg mcause; // Interrupt cause (and lock)
reg [63:0] cycles; // Cycle counter
always @(posedge clk) cycles <= cycles + 1;
// CSR address decode: instr[31:20] holds the CSR number.
wire sel_mstatus = (instr[31:20] == 12'h300);
wire sel_mtvec = (instr[31:20] == 12'h305);
wire sel_mepc = (instr[31:20] == 12'h341);
wire sel_mcause = (instr[31:20] == 12'h342);
wire sel_cycles = (instr[31:20] == 12'hC00);
wire sel_cyclesh = (instr[31:20] == 12'hC80);
// Read CSRs
// mstatus is read back as bit 3, mcause as bit 31; mtvec and mepc are
// ADDR_WIDTH bits wide (hence the WIDTH lint waiver). Reads of any other
// CSR number return 0.
/* verilator lint_off WIDTH */
wire [31:0] CSR_read =
(sel_mstatus ? {28'b0, mstatus, 3'b0} : 32'b0) |
(sel_mtvec ? mtvec : 32'b0) |
(sel_mepc ? mepc : 32'b0) |
(sel_mcause ? {mcause, 31'b0} : 32'b0) |
(sel_cycles ? cycles[31:0] : 32'b0) |
(sel_cyclesh ? cycles[63:32] : 32'b0) ;
/* verilator lint_on WIDTH */
// Write CSRs: 5 bit unsigned immediate or content of RS1
wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;
// instr[13:12]: 2'b10 -> set bits, 2'b11 -> clear bits, otherwise write.
wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read :
(instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
/* (instr[13:12] == 2'b01) ? */ CSR_modifier ;
// CSR write port. reset (active low) only clears mstatus. Otherwise a
// SYSTEM instruction with funct3 != 0 in the EXECUTE state updates the
// selected CSR; only mstatus and mtvec are writable from here.
always @(posedge clk) begin
   if (!reset) begin
      mstatus <= 0;
   end else if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
      if (sel_mstatus) mstatus <= CSR_write[3];
      if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
   end
end
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
// One term per instruction kind, OR-ed together. PCplusImm and PCinc are
// ADDR_WIDTH bits wide, hence the WIDTH lint waiver.
/* verilator lint_off WIDTH */
wire [31:0] writeBackData =
(isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
(isLUI ? Uimm : 32'b0) | // LUI
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
(isFPU ? fpuOut : 32'b0) | // FPU
(isAUIPC ? PCplusImm : 32'b0) | // AUIPC
(isJALR | isJAL ? PCinc : 32'b0) | // JAL, JALR
(isLoad ? LOAD_data : 32'b0) ; // Load
/* verilator lint_on WIDTH */
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - mem_addr[1:0]: indicates which byte/halfword is accessed
// TODO: support unaligned accesses for FLW and FSW
// instr[2] is set for FLW and FSW. instr[13:12] = func3[1:0]
// (so FLW/FSW are always handled as word accesses)
wire mem_byteAccess = !instr[2] && (instr[13:12] == 2'b00);
wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01);
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
// LOAD_sign is the bit replicated into the upper part of the loaded value.
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
// Loaded value: extended byte, extended halfword, or the full word.
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
// Address bit 1 selects the upper or lower halfword of the word read.
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
// Address bit 0 selects the upper or lower byte of that halfword.
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
// rs2 is replicated onto the byte lanes so that, whatever the two low
// address bits are, the lanes enabled by the write mask carry the right
// data. Lanes listed from byte 3 (MSB) down to byte 0.
assign mem_wdata = {
   (loadstore_addr[0] ? rs2[7:0] :
    loadstore_addr[1] ? rs2[15:8] : rs2[31:24]),      // [31:24]
   (loadstore_addr[1] ? rs2[7:0]  : rs2[23:16]),      // [23:16]
   (loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8]),      // [15: 8]
   rs2[7:0]                                           // [ 7: 0]
};

// Byte-lane write mask:
//   word     : 1111
//   halfword : 0011 or 1100, selected by loadstore_addr[1]
//   byte     : a single 1 at the position given by loadstore_addr[1:0]
wire [3:0] STORE_wmask =
   mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])          :
   mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011)   :
                        4'b1111;
/***************************************************************************/
// Unaligned fetch mechanism and compressed opcode handling
/***************************************************************************/
// One-word fetch cache: the last word read and its word address.
reg [ADDR_WIDTH-1:2] cached_addr;
reg [31:0] cached_data;
// Cache hit for the word containing PC / the word containing PC_new.
wire current_cache_hit = cached_addr == PC [ADDR_WIDTH-1:2];
wire next_cache_hit = cached_addr == PC_new [ADDR_WIDTH-1:2];
// A 32-bit instruction (two low opcode bits both set) that starts in the
// upper halfword of a word, i.e. that straddles two words.
wire current_unaligned_long = &cached_mem [17:16] & PC [1];
wire next_unaligned_long = &cached_data[17:16] & PC_new[1];
// Set while the second word of a straddling instruction is being fetched.
reg fetch_second_half;
// Set if the latched instruction is a 32-bit (non-compressed) one.
reg long_instr;
// Word containing PC: from the cache on a hit, else from the memory bus.
wire [31:0] cached_mem = current_cache_hit ? cached_data : mem_rdata;
// Instruction word as fetched: for a halfword-aligned PC, the upper
// halfword of cached_mem followed by the lower halfword of mem_rdata.
wire [31:0] raw_instr = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
: cached_mem;
// Expansion of the 16 low bits of raw_instr by the 'decompressor' module
// (defined elsewhere).
wire [31:0] decompressed;
decompressor _decomp ( .c(raw_instr[15:0]), .d(decompressed) );
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
// Bit index of each state inside the one-hot 'state' register.
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam DECOMPRESS_GETREGS_bit = 2;
localparam EXECUTE_bit = 3;
localparam WAIT_ALU_OR_MEM_bit = 4;
localparam WAIT_ALU_OR_MEM_SKIP_bit = 5;
localparam NB_STATES = 6;
// One-hot state values derived from the bit indices above.
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam DECOMPRESS_GETREGS = 1 << DECOMPRESS_GETREGS_bit;
localparam EXECUTE = 1 << EXECUTE_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
localparam WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable.
// Asserted for every instruction except branches and stores, while the
// FPU is idle, in EXECUTE and in both wait states.
wire writeBack = ~(isBranch | isStore ) & !fpuBusy & (
state[EXECUTE_bit] |
state[WAIT_ALU_OR_MEM_bit] |
state[WAIT_ALU_OR_MEM_SKIP_bit]
);
// The memory-read signal.
// Raised for instruction fetch and when a load is executed.
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write.
// Non-zero only while a store is in the EXECUTE state.
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (divide) in the ALU.
assign aluWr = state[EXECUTE_bit] & isALU;
// Taken branch or JAL: next PC comes from the PC+imm adder.
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
// Instructions that need a wait state after EXECUTE: loads, stores, all
// RV32M instructions (MUL results are registered too) and FPU
// instructions. When NRV_IS_IO_ADDR is defined, only stores to addresses
// matched by that macro wait.
`ifdef NRV_IS_IO_ADDR
wire needToWait = isLoad |
(isStore & `NRV_IS_IO_ADDR(mem_addr)) |
isALUreg & funcM /* isDivide */ |
isFPU;
`else
wire needToWait = isLoad |
isStore |
isALUreg & funcM /* isDivide */ |
isFPU;
`endif
// Next PC, in priority order: JALR target (bit 0 forced to 0), PC+imm,
// saved mepc on interrupt return, next sequential instruction.
wire [ADDR_WIDTH-1:0] PC_new =
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
interrupt_return ? mepc :
PCinc;
// Main sequencer (clocked on the rising edge of clk, reset is active low).
//
// State transitions visible in this block:
//   FETCH_INSTR          -> WAIT_INSTR
//   WAIT_INSTR           -> FETCH_INSTR         (second half of an unaligned
//                                                long opcode still missing)
//                        -> EXECUTE             (long opcode)
//                        -> DECOMPRESS_GETREGS  (compressed opcode)
//   DECOMPRESS_GETREGS   -> EXECUTE
//   EXECUTE              -> WAIT_INSTR / WAIT_ALU_OR_MEM_SKIP (next opcode
//                           is already in the one-word cache)
//                        -> FETCH_INSTR / WAIT_ALU_OR_MEM     (otherwise)
//   WAIT_ALU_OR_MEM      -> FETCH_INSTR  (once ALU, FPU and memory are idle)
//   WAIT_ALU_OR_MEM_SKIP -> WAIT_INSTR   (once ALU, FPU and memory are idle)
always @(posedge clk) begin
   if(!reset) begin
      state             <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
      PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
      mcause            <= 0;
      cached_addr       <= {ADDR_WIDTH-2{1'b1}}; // Needs to be an invalid addr
      fetch_second_half <= 0;
   end else begin
      // See note [1] at the end of this file.
      (* parallel_case *)
      case(1'b1)

        state[WAIT_INSTR_bit]: begin
           if(!mem_rbusy) begin // may be high when executing from SPI flash
              // Update cache: keep the word that was just read when it was
              // not the cached one, or when it is the second half of an
              // unaligned long opcode.
              if (~current_cache_hit | fetch_second_half) begin
                 cached_addr <= mem_addr[ADDR_WIDTH-1:2];
                 cached_data <= mem_rdata;
              end

              // Decode instruction
              // Registers are fetched at the same time, in the
              // FPU's always block.
              instr      <= &raw_instr[1:0] ? raw_instr[31:2]
                                            : decompressed[31:2];
              long_instr <= &raw_instr[1:0];

              // Long opcode, unaligned, first part fetched,
              // happens in non-linear code
              if (current_unaligned_long & ~fetch_second_half) begin
                 fetch_second_half <= 1;
                 state             <= FETCH_INSTR;
              end else begin
                 fetch_second_half <= 0;
                 state <= &raw_instr[1:0] ? EXECUTE : DECOMPRESS_GETREGS;
              end
           end
        end

        state[DECOMPRESS_GETREGS_bit]: begin
           // All the registers are fetched in FPU's always block.
           state <= EXECUTE;
        end

        state[EXECUTE_bit]: begin
           if (interrupt) begin
              // Enter the handler at mtvec; the address that would have
              // been executed next is saved in mepc.
              PC     <= mtvec;
              mepc   <= PC_new;
              mcause <= 1;
              state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
           end else begin
              // Unaligned load/store not implemented yet
              // (the norm supposes that FLW and FSW can handle them)
              `ASSERT(
                 !((isLoad|isStore) && instr[2] && |loadstore_addr[1:0]),
                 ("PC=%x UNALIGNED FLW/FSW",PC)
              );
              PC <= PC_new;
              if (interrupt_return) mcause <= 0;

              // The FETCH_INSTR cycle is skipped when the next opcode is
              // entirely available from the one-word cache.
              state <= next_cache_hit & ~next_unaligned_long
                       ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
                       : (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);

              // First half is cached, only the second half is missing.
              fetch_second_half <= next_cache_hit & next_unaligned_long;
           end
        end

        state[WAIT_ALU_OR_MEM_bit]: begin
           if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
              state <= FETCH_INSTR;
           end
        end

        state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
           if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
              state <= WAIT_INSTR;
           end
        end

        default: begin // FETCH_INSTR
           state <= WAIT_INSTR;
        end

      endcase
   end
end
// Simulation-only initialization (compiled only when BENCH is defined):
// gives the cycle counter and entry 0 of the register file a known value
// at time 0.
`ifdef BENCH
initial begin
cycles = 0;
registerFile[0] = 0;
end
`endif
endmodule
/*****************************************************************************/
module decompressor(
   input  wire [15:0] c,
   output reg  [31:0] d
);
   // Purely combinational expansion of one 16-bit compressed opcode (c)
   // into the equivalent 32-bit instruction (d).
   // Opcodes whose two low bits are 2'b11 are not compressed: they are
   // not matched by any entry of the table and yield X.

   // Encodings produced for illegal / unknown opcodes.
   localparam [31:0] ILLEGAL_INSTR = 32'h0;
   localparam [31:0] UNKNOWN_INSTR = 32'h0; // only used by the disabled default

   // Major opcodes of the generated 32-bit instructions
   localparam [6:0] OPC_LOAD     = 7'b0000011;
   localparam [6:0] OPC_LOAD_FP  = 7'b0000111;
   localparam [6:0] OPC_OP_IMM   = 7'b0010011;
   localparam [6:0] OPC_STORE    = 7'b0100011;
   localparam [6:0] OPC_STORE_FP = 7'b0100111;
   localparam [6:0] OPC_OP       = 7'b0110011;
   localparam [6:0] OPC_LUI      = 7'b0110111;
   localparam [6:0] OPC_BRANCH   = 7'b1100011;
   localparam [6:0] OPC_JALR     = 7'b1100111;
   localparam [6:0] OPC_JAL      = 7'b1101111;

   // funct7 values
   localparam [6:0] F7_ZERO = 7'b0000000;
   localparam [6:0] F7_ALT  = 7'b0100000; // SUB, SRAI

   // Fixed registers
   localparam [4:0] REG_ZERO = 5'b00000; // x0
   localparam [4:0] REG_RA   = 5'b00001; // x1
   localparam [4:0] REG_SP   = 5'b00010; // x2

   // Register fields. The 3-bit fields address x8..x15 (prefix 2'b01).
   wire [4:0] regLoPrime = {2'b01, c[4:2]}; // 3-bit field, bits 4:2
   wire [4:0] regHiPrime = {2'b01, c[9:7]}; // 3-bit field, bits 9:7
   wire [4:0] regLoFull  = c[ 6:2];         // 5-bit field, bits 6:2
   wire [4:0] regHiFull  = c[11:7];         // 5-bit field, bits 11:7

   // Immediates, unscrambled from the compressed encodings
   wire  [4:0] shamt       = c[6:2];
   wire [11:0] immAddi4spn = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
   wire [11:0] immLwSw     = {5'b00000, c[5], c[12:10], c[6], 2'b00};
   wire [11:0] immLwsp     = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
   wire [11:0] immSwsp     = {4'b0000, c[8:7], c[12:9], 2'b00};
   wire [11:0] immAddi16sp = {{3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
   wire [11:0] immSmall    = {{7{c[12]}}, c[6:2]};
   /* verilator lint_off UNUSED */
   wire [12:0] immBranch   = {{5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
   wire [20:0] immJump     = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
   wire [31:0] immLui      = {{15{c[12]}}, c[6:2], 12'b000000000000};
   /* verilator lint_on UNUSED */

   // Immediates re-scrambled the way the J and B formats expect them
   wire [19:0] jalField = {immJump[20], immJump[10:1], immJump[11], immJump[19:12]};
   wire  [6:0] brHigh   = {immBranch[12], immBranch[10:5]};
   wire  [4:0] brLow    = {immBranch[4:1], immBranch[11]};

   // The order of the entries matters: a more specific pattern is listed
   // before the generic pattern that also matches it.
   always @(*) begin
      casez (c)
        // ---- quadrant 0 -------------------------------------------------
        /* verilator lint_off CASEOVERLAP */
        16'b000___00000000_000_00 : // c.illegal
          d = ILLEGAL_INSTR;
        16'b000___????????_???_00 : // c.addi4spn -> addi rd', x2, nzuimm
          d = {immAddi4spn, REG_SP, 3'b000, regLoPrime, OPC_OP_IMM};
        /* verilator lint_on CASEOVERLAP */
        16'b010_???_???_??_???_00 : // c.lw -> lw rd', offset(rs1')
          d = {immLwSw, regHiPrime, 3'b010, regLoPrime, OPC_LOAD};
        16'b110_???_???_??_???_00 : // c.sw -> sw rs2', offset(rs1')
          d = {immLwSw[11:5], regLoPrime, regHiPrime, 3'b010, immLwSw[4:0], OPC_STORE};

        // ---- quadrant 1 -------------------------------------------------
        16'b000_???_???_??_???_01 : // c.addi -> addi rd, rd, nzimm
          d = {immSmall, regHiFull, 3'b000, regHiFull, OPC_OP_IMM};
        16'b001____???????????_01 : // c.jal -> jal x1, offset
          d = {jalField, REG_RA, OPC_JAL};
        16'b010__?_?????_?????_01 : // c.li -> addi rd, x0, imm
          d = {immSmall, REG_ZERO, 3'b000, regHiFull, OPC_OP_IMM};
        16'b011__?_00010_?????_01 : // c.addi16sp -> addi x2, x2, nzimm
          d = {immAddi16sp, regHiFull, 3'b000, regHiFull, OPC_OP_IMM};
        16'b011__?_?????_?????_01 : // c.lui -> lui rd, nzuimm
          d = {immLui[31:12], regHiFull, OPC_LUI};
        16'b100_?_00_???_?????_01 : // c.srli -> srli rd', rd', shamt
          d = {F7_ZERO, shamt, regHiPrime, 3'b101, regHiPrime, OPC_OP_IMM};
        16'b100_?_01_???_?????_01 : // c.srai -> srai rd', rd', shamt
          d = {F7_ALT, shamt, regHiPrime, 3'b101, regHiPrime, OPC_OP_IMM};
        16'b100_?_10_???_?????_01 : // c.andi -> andi rd', rd', imm
          d = {immSmall, regHiPrime, 3'b111, regHiPrime, OPC_OP_IMM};
        16'b100_011_???_00_???_01 : // c.sub -> sub rd', rd', rs2'
          d = {F7_ALT, regLoPrime, regHiPrime, 3'b000, regHiPrime, OPC_OP};
        16'b100_011_???_01_???_01 : // c.xor -> xor rd', rd', rs2'
          d = {F7_ZERO, regLoPrime, regHiPrime, 3'b100, regHiPrime, OPC_OP};
        16'b100_011_???_10_???_01 : // c.or -> or rd', rd', rs2'
          d = {F7_ZERO, regLoPrime, regHiPrime, 3'b110, regHiPrime, OPC_OP};
        16'b100_011_???_11_???_01 : // c.and -> and rd', rd', rs2'
          d = {F7_ZERO, regLoPrime, regHiPrime, 3'b111, regHiPrime, OPC_OP};
        16'b101____???????????_01 : // c.j -> jal x0, offset
          d = {jalField, REG_ZERO, OPC_JAL};
        16'b110__???_???_?????_01 : // c.beqz -> beq rs1', x0, offset
          d = {brHigh, REG_ZERO, regHiPrime, 3'b000, brLow, OPC_BRANCH};
        16'b111__???_???_?????_01 : // c.bnez -> bne rs1', x0, offset
          d = {brHigh, REG_ZERO, regHiPrime, 3'b001, brLow, OPC_BRANCH};

        // ---- quadrant 2 -------------------------------------------------
        16'b000__?_?????_?????_10 : // c.slli -> slli rd, rd, shamt
          d = {F7_ZERO, shamt, regHiFull, 3'b001, regHiFull, OPC_OP_IMM};
        16'b010__?_?????_?????_10 : // c.lwsp -> lw rd, offset(x2)
          d = {immLwsp, REG_SP, 3'b010, regHiFull, OPC_LOAD};
        16'b100__0_?????_00000_10 : // c.jr -> jalr x0, rs1, 0
          d = {12'b000000000000, regHiFull, 3'b000, REG_ZERO, OPC_JALR};
        16'b100__0_?????_?????_10 : // c.mv -> add rd, x0, rs2
          d = {F7_ZERO, regLoFull, REG_ZERO, 3'b000, regHiFull, OPC_OP};
        // c.ebreak is not decoded separately: it is matched by c.jalr below.
        16'b100__1_?????_00000_10 : // c.jalr -> jalr x1, rs1, 0
          d = {12'b000000000000, regHiFull, 3'b000, REG_RA, OPC_JALR};
        16'b100__1_?????_?????_10 : // c.add -> add rd, rd, rs2
          d = {F7_ZERO, regLoFull, regHiFull, 3'b000, regHiFull, OPC_OP};
        16'b110__?_?????_?????_10 : // c.swsp -> sw rs2, offset(x2)
          d = {immSwsp[11:5], regLoFull, REG_SP, 3'b010, immSwsp[4:0], OPC_STORE};

        // ---- four compressed RV32F load/store instructions --------------
        16'b011_???_???_??_???_00 : // c.flw -> flw rd', offset(rs1')
          d = {immLwSw, regHiPrime, 3'b010, regLoPrime, OPC_LOAD_FP};
        16'b111_???_???_??_???_00 : // c.fsw -> fsw rs2', offset(rs1')
          d = {immLwSw[11:5], regLoPrime, regHiPrime, 3'b010, immLwSw[4:0], OPC_STORE_FP};
        16'b011__?_?????_?????_10 : // c.flwsp -> flw rd, offset(x2)
          d = {immLwsp, REG_SP, 3'b010, regHiFull, OPC_LOAD_FP};
        16'b111__?_?????_?????_10 : // c.fswsp -> fsw rs2, offset(x2)
          d = {immSwsp[11:5], regLoFull, REG_SP, 3'b010, immSwsp[4:0], OPC_STORE_FP};

        // Anything else is a don't care (alternative: d = UNKNOWN_INSTR).
        default : d = {32{1'bx}};
      endcase
   end
endmodule
/*****************************************************************************/

View File

@@ -0,0 +1,420 @@
/*******************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
// This version: The "Quark", the most elementary version of FemtoRV32.
// A single VERILOG file, compact & understandable code.
// (200 lines of code, 400 lines counting comments)
//
// Instruction set: RV32I + RDCYCLES
//
// Parameters:
// Reset address can be defined using RESET_ADDR (default is 0).
//
// The ADDR_WIDTH parameter lets you define the width of the internal
// address bus (and address computation logic).
//
// Macros:
// optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
// evaluate to 1 if addr is in mapped IO space,
// evaluate to 0 otherwise
// (additional wait states are used when in IO space).
// If left undefined, wait states are always used.
//
// NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
// by the ticks counter. If not defined, a 32-bits counter is generated.
// (reducing its width may be useful for space-constrained designs).
//
// NRV_TWOLEVEL_SHIFTER may be defined to make shift operations faster
// (uses a two-level shifter inspired by picorv32).
//
// Bruno Levy, Matthias Koch, 2020-2021
/*******************************************************************/
// Firmware generation flags for this processor
`define NRV_ARCH "rv32i"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-Os"
module FemtoRV32(
   input         clk,
   output [31:0] mem_addr,  // address bus
   output [31:0] mem_wdata, // data to be written
   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
   input  [31:0] mem_rdata, // input lines for both data and instr
   output        mem_rstrb, // active to initiate memory read (used by IO)
   input         mem_rbusy, // asserted if memory is busy reading value
   input         mem_wbusy, // asserted if memory is busy writing value
   input         reset      // set to 0 to reset the processor
);

   // Multi-cycle RV32I core built around a four-state 1-hot sequencer
   // (FETCH_INSTR -> WAIT_INSTR -> EXECUTE [-> WAIT_ALU_OR_MEM]).
   // Instructions and data share the same memory port; shifts are done
   // serially in the ALU, all other operations are combinational.

   parameter RESET_ADDR = 32'h00000000; // value loaded in PC on reset
   parameter ADDR_WIDTH = 24;           // width of PC and address adders

   /***************************************************************************/
   // Instruction decoding.
   /***************************************************************************/

   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
   // Reference: Table page 104 of:
   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

   // The destination register
   wire [4:0] rdId = instr[11:7];

   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
   // It is used as follows: funct3Is[val] <=> funct3 == val
   (* onehot *)
   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];

   // The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
   wire [31:0] Uimm = {    instr[31],   instr[30:12], {12{1'b0}}};
   wire [31:0] Iimm = {{21{instr[31]}}, instr[30:20]};
   /* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
   wire [31:0] Simm = {{21{instr[31]}}, instr[30:25],instr[11:7]};
   wire [31:0] Bimm = {{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
   wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
   /* verilator lint_on UNUSED */

   // Base RISC-V (RV32I) has only 10 different instructions !
   wire isLoad    = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
   wire isALUimm  = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
   wire isStore   = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
   wire isALUreg  = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
   wire isSYSTEM  = (instr[6:2] == 5'b11100); // rd <- cycles
   // JAL is decoded with the full opcode. Testing instr[3] alone is not
   // enough: MISC-MEM (FENCE, opcode 0001111) also has bit 3 set and would
   // be executed as a jump to PC+Jimm. With the full test, FENCE matches
   // none of the decoders and simply advances PC by 4.
   wire isJAL     = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
   wire isJALR    = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
   wire isLUI     = (instr[6:2] == 5'b01101); // rd <- Uimm
   wire isAUIPC   = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
   wire isBranch  = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm

   wire isALU = isALUimm | isALUreg;

   /***************************************************************************/
   // The register file.
   /***************************************************************************/

   // rs1 and rs2 are loaded by the state machine (WAIT_INSTR state),
   // directly from the register fields of the word read from memory.
   reg [31:0] rs1;
   reg [31:0] rs2;
   (* no_rw_check *)
   reg [31:0] registerFile [31:0];

   // Entry 0 is never written, so that x0 keeps its initial value.
   always @(posedge clk) begin
      if (writeBack)
        if (rdId != 0)
          registerFile[rdId] <= writeBackData;
   end

   /***************************************************************************/
   // The ALU. Does operations and tests combinatorially, except shifts.
   /***************************************************************************/

   // First ALU source, always rs1
   wire [31:0] aluIn1 = rs1;

   // Second ALU source, depends on opcode:
   //    ALUreg, Branch:     rs2
   //    ALUimm, Load, JALR: Iimm
   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;

   reg  [31:0] aluReg;           // The internal register of the ALU, used by shift.
   reg  [4:0]  aluShamt;         // Current shift amount.

   wire aluBusy = |aluShamt;     // ALU is busy if shift amount is non-zero.
   wire aluWr;                   // ALU write strobe, starts shifting.

   // The adder is used by both arithmetic instructions and JALR.
   wire [31:0] aluPlus = aluIn1 + aluIn2;

   // Use a single 33 bits subtract to do subtraction and all comparisons
   // (trick borrowed from swapforth/J1)
   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
   wire        LTU = aluMinus[32];
   wire        EQ  = (aluMinus[31:0] == 0);

   // Notes:
   // - instr[30] is 1 for SUB and 0 for ADD
   // - for SUB, need to test also instr[5] to discriminate ADDI:
   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
   wire [31:0] aluOut =
     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) |
     (funct3IsShift ? aluReg                                         : 32'b0) ;

   wire funct3IsShift = funct3Is[1] | funct3Is[5];

   // Serial shifter. aluWr loads the operand and the shift amount; then
   // aluReg is shifted by one position per clock (by four positions while
   // the remaining amount is >= 4 when NRV_TWOLEVEL_SHIFTER is defined)
   // until aluShamt reaches zero.
   always @(posedge clk) begin
      if(aluWr) begin
         if (funct3IsShift) begin  // SLL, SRA, SRL
            aluReg   <= aluIn1;
            aluShamt <= aluIn2[4:0];
         end
      end

`ifdef NRV_TWOLEVEL_SHIFTER
      else if(|aluShamt[4:2]) begin // Shift by 4
         aluShamt <= aluShamt - 4;
         aluReg   <= funct3Is[1] ? aluReg << 4 :
                     {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};
      end else
`endif
      // Compact form of:
      // funct3=001              -> SLL  (aluReg <= aluReg << 1)
      // funct3=101 &  instr[30] -> SRA  (aluReg <= {aluReg[31], aluReg[31:1]})
      // funct3=101 & !instr[30] -> SRL  (aluReg <= {1'b0,       aluReg[31:1]})
      if (|aluShamt) begin
         aluShamt <= aluShamt - 1;
         aluReg   <= funct3Is[1] ? aluReg << 1 :              // SLL
                     {instr[30] & aluReg[31], aluReg[31:1]};  // SRA,SRL
      end
   end

   /***************************************************************************/
   // The predicate for conditional branches.
   /***************************************************************************/

   wire predicate =
        funct3Is[0] &  EQ  | // BEQ
        funct3Is[1] & !EQ  | // BNE
        funct3Is[4] &  LT  | // BLT
        funct3Is[5] & !LT  | // BGE
        funct3Is[6] &  LTU | // BLTU
        funct3Is[7] & !LTU ; // BGEU

   /***************************************************************************/
   // Program counter and branch target computation.
   /***************************************************************************/

   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
                             // ignored (not used in RV32I base instr set).

   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;

   // An adder used to compute branch address, JAL address and AUIPC.
   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
   // (its result is only used for these three instructions).
   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
                                                       Bimm[ADDR_WIDTH-1:0] );

   // A separate adder to compute the destination of load/store.
   // testing instr[5] is equivalent to testing isStore in this context.
   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);

   /* verilator lint_off WIDTH */
   // internal address registers and cycles counter may have less than
   // 32 bits, so we deactivate width test for mem_addr and writeBackData

   // The address bus carries PC while fetching, the load/store address
   // otherwise.
   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
                     PC : loadstore_addr ;

   /***************************************************************************/
   // The value written back to the register file.
   /***************************************************************************/

   wire [31:0] writeBackData  =
      (isSYSTEM            ? cycles     : 32'b0) |  // SYSTEM
      (isLUI               ? Uimm       : 32'b0) |  // LUI
      (isALU               ? aluOut     : 32'b0) |  // ALUreg, ALUimm
      (isAUIPC             ? PCplusImm  : 32'b0) |  // AUIPC
      (isJALR   | isJAL    ? PCplus4    : 32'b0) |  // JAL, JALR
      (isLoad              ? LOAD_data  : 32'b0) ;  // Load
   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // LOAD/STORE
   /***************************************************************************/

   // All memory accesses are aligned on 32 bits boundary. For this
   // reason, we need some circuitry that does unaligned halfword
   // and byte load/store, based on:
   // - funct3[1:0]:  00->byte 01->halfword 10->word
   // - mem_addr[1:0]: indicates which byte/halfword is accessed

   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;

   // LOAD, in addition to funct3[1:0], LOAD depends on:
   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion
   wire LOAD_sign =
        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

   wire [31:0] LOAD_data =
         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                          mem_rdata ;

   wire [15:0] LOAD_halfword =
               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

   wire  [7:0] LOAD_byte =
               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

   // STORE: the byte / halfword to be written is replicated on the lanes
   // selected by the low address bits; mem_wmask tells which lanes count.
   assign mem_wdata[ 7: 0] = rs2[7:0];
   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];

   // The memory write mask:
   //    1111                     if writing a word
   //    0011 or 1100             if writing a halfword
   //                                (depending on loadstore_addr[1])
   //    0001, 0010, 0100 or 1000 if writing a byte
   //                                (depending on loadstore_addr[1:0])
   wire [3:0] STORE_wmask =
              mem_byteAccess      ?
                    (loadstore_addr[1] ?
                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
                    ) :
              mem_halfwordAccess ?
                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
              4'b1111;

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   // The *_bit constants are bit indices in 'state', the constants
   // without suffix are the 1-hot values assigned to 'state'.
   localparam FETCH_INSTR_bit     = 0;
   localparam WAIT_INSTR_bit      = 1;
   localparam EXECUTE_bit         = 2;
   localparam WAIT_ALU_OR_MEM_bit = 3;
   localparam NB_STATES           = 4;

   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
   localparam EXECUTE         = 1 << EXECUTE_bit;
   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;

   (* onehot *)
   reg [NB_STATES-1:0] state;

   // The signals (internal and external) that are determined
   // combinatorially from state and other signals.

   // register write-back enable.
   wire writeBack = ~(isBranch | isStore ) &
                    (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);

   // The memory-read signal.
   assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];

   // The mask for memory-write.
   assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;

   // aluWr starts computation (shifts) in the ALU.
   assign aluWr = state[EXECUTE_bit] & isALU;

   wire jumpToPCplusImm = isJAL | (isBranch & predicate);

   // needToWait inserts the WAIT_ALU_OR_MEM state after EXECUTE.
`ifdef NRV_IS_IO_ADDR
   wire needToWait = isLoad |
                     isStore & `NRV_IS_IO_ADDR(mem_addr) |
                     isALU & funct3IsShift;
`else
   wire needToWait = isLoad | isStore | isALU & funct3IsShift;
`endif

   always @(posedge clk) begin
      if(!reset) begin
         state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
         PC    <= RESET_ADDR[ADDR_WIDTH-1:0];
      end else

      // See note [1] at the end of this file.
      (* parallel_case *)
      case(1'b1)

        state[WAIT_INSTR_bit]: begin
           if(!mem_rbusy) begin // may be high when executing from SPI flash
              // The source registers are read straight from the register
              // fields of the incoming word, at the same time instr is
              // latched.
              rs1   <= registerFile[mem_rdata[19:15]];
              rs2   <= registerFile[mem_rdata[24:20]];
              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
              state <= EXECUTE;         // also the declaration of instr).
           end
        end

        state[EXECUTE_bit]: begin
           PC <= isJALR          ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
                 jumpToPCplusImm ? PCplusImm :
                 PCplus4;
           state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
        end

        state[WAIT_ALU_OR_MEM_bit]: begin
           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
        end

        default: begin // FETCH_INSTR
           state <= WAIT_INSTR;
        end

      endcase
   end

   /***************************************************************************/
   // Cycle counter
   /***************************************************************************/

`ifdef NRV_COUNTER_WIDTH
   reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`else
   reg [31:0]  cycles;
`endif
   // Free-running: incremented on every clock, not affected by reset.
   always @(posedge clk) cycles <= cycles + 1;

   // Simulation-only initial values (compiled only when BENCH is defined).
`ifdef BENCH
   initial begin
      cycles = 0;
      aluShamt = 0;
      registerFile[0] = 0;
   end
`endif

endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements,
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

View File

@@ -0,0 +1,409 @@
/*******************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
// This version: The "Quark", the most elementary version of FemtoRV32.
// A single VERILOG file, compact & understandable code.
// (200 lines of code, 400 lines counting comments)
//
// Instruction set: RV32I + RDCYCLES
//
// Parameters:
// Reset address can be defined using RESET_ADDR (default is 0).
//
// The ADDR_WIDTH parameter lets you define the width of the internal
// address bus (and address computation logic).
//
// Macros:
// optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
// evaluate to 1 if addr is in mapped IO space,
// evaluate to 0 otherwise
// (additional wait states are used when in IO space).
// If left undefined, wait states are always used.
//
// NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
// by the ticks counter. If not defined, a 32-bits counter is generated.
// (reducing its width may be useful for space-constrained designs).
//
// Bruno Levy, Matthias Koch, 2020-2021
/*******************************************************************/
// Firmware generation flags for this processor
`define NRV_ARCH "rv32i"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-Os"
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input reset // set to 0 to reset the processor
);
parameter RESET_ADDR = 32'h00000000;
parameter ADDR_WIDTH = 24;
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
// The destination register
wire [4:0] rdId = instr[11:7];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *)
wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
// The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
wire [31:0] Uimm = { instr[31], instr[30:12], {12{1'b0}}};
wire [31:0] Iimm = {{21{instr[31]}}, instr[30:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
wire [31:0] Simm = {{21{instr[31]}}, instr[30:25],instr[11:7]};
wire [31:0] Bimm = {{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
/* verilator lint_on UNUSED */
// Base RISC-V (RV32I) has only 10 different instructions !
wire isLoad = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- cycles
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] registerFile [31:0];
always @(posedge clk) begin
if (writeBack)
if (rdId != 0)
registerFile[rdId] <= writeBackData;
end
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except shifts.
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire LTU = aluMinus[32];
wire EQ = (aluMinus[31:0] == 0);
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
wire [31:0] shifter_in = funct3Is[1] ?
{aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
aluIn1[30], aluIn1[31]} : aluIn1;
/* verilator lint_off WIDTH */
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] leftshift = {
shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
shifter[30], shifter[31]};
/***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
wire [31:0] aluOut =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
(funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
/***************************************************************************/
// The predicate for conditional branches.
/***************************************************************************/
wire predicate =
funct3Is[0] & EQ | // BEQ
funct3Is[1] & !EQ | // BNE
funct3Is[4] & LT | // BLT
funct3Is[5] & !LT | // BGE
funct3Is[6] & LTU | // BLTU
funct3Is[7] & !LTU ; // BGEU
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
// An adder used to compute branch address, JAL address and AUIPC.
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// A separate adder to compute the destination of load/store.
// testing instr[5] is equivalent to testing isStore in this context.
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
/* verilator lint_off WIDTH */
// internal address registers and cycles counter may have less than
// 32 bits, so we deactivate width test for mem_addr and writeBackData
wire [ADDR_WIDTH-1:0] PC_new =
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
PCplus4;
assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ? PC :
state[EXECUTE_bit] & ~isLoad & ~isStore ? PC_new :
loadstore_addr ;
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
wire [31:0] writeBackData =
(isSYSTEM ? cycles : 32'b0) | // SYSTEM
(isLUI ? Uimm : 32'b0) | // LUI
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
(isAUIPC ? PCplusImm : 32'b0) | // AUIPC
(isJALR | isJAL ? PCplus4 : 32'b0) | // JAL, JALR
(isLoad ? LOAD_data : 32'b0) ; // Load
/* verilator lint_on WIDTH */
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - mem_addr[1:0]: indicates which byte/halfword is accessed
wire mem_byteAccess = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
// The memory write mask:
// 1111 if writing a word
// 0011 or 1100 if writing a halfword
// (depending on loadstore_addr[1])
// 0001, 0010, 0100 or 1000 if writing a byte
// (depending on loadstore_addr[1:0])
wire [3:0] STORE_wmask =
mem_byteAccess ?
(loadstore_addr[1] ?
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
) :
mem_halfwordAccess ?
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
4'b1111;
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam EXECUTE_bit = 2;
localparam WAIT_ALU_OR_MEM_bit = 3;
localparam NB_STATES = 4;
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam EXECUTE = 1 << EXECUTE_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable.
wire writeBack = ~(isBranch | isStore ) &
(state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
// The memory-read signal.
assign mem_rstrb = state[EXECUTE_bit] & ~isStore | state[FETCH_INSTR_bit];
// The mask for memory-write.
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
`ifdef NRV_IS_IO_ADDR
wire needToWait = isLoad |
isStore & `NRV_IS_IO_ADDR(mem_addr) ;
`else
wire needToWait = isLoad | isStore ;
`endif
always @(posedge clk) begin
if(!reset) begin
state <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
PC <= RESET_ADDR[ADDR_WIDTH-1:0];
end else
// See note [1] at the end of this file.
(* parallel_case *)
case(1'b1)
state[WAIT_INSTR_bit]: begin
if(!mem_rbusy) begin // may be high when executing from SPI flash
rs1 <= registerFile[mem_rdata[19:15]];
rs2 <= registerFile[mem_rdata[24:20]];
instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
state <= EXECUTE; // also the declaration of instr).
end
end
state[EXECUTE_bit]: begin
PC <= PC_new;
state <= needToWait ? WAIT_ALU_OR_MEM : WAIT_INSTR;
end
state[WAIT_ALU_OR_MEM_bit]: begin
if(!mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
end
default: begin // FETCH_INSTR
state <= WAIT_INSTR;
end
endcase
end
/***************************************************************************/
// Cycle counter
/***************************************************************************/
`ifdef NRV_COUNTER_WIDTH
reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`else
reg [31:0] cycles;
`endif
always @(posedge clk) cycles <= cycles + 1;
`ifdef BENCH
initial begin
cycles = 0;
registerFile[0] = 0;
end
`endif
endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements,
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

View File

@@ -0,0 +1,421 @@
/*******************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
// This version: The "Tachyon". It works like the "Quark", with the
// difference that EXECUTE is split into two steps. This allows
// higher maxfreq.
//
// Instruction set: RV32I + RDCYCLES
//
// Parameters:
// Reset address can be defined using RESET_ADDR (default is 0).
//
// The ADDR_WIDTH parameter lets you define the width of the internal
// address bus (and address computation logic).
//
// Macros:
// optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
// evaluate to 1 if addr is in mapped IO space,
// evaluate to 0 otherwise
// (additional wait states are used when in IO space).
// If left undefined, wait states are always used.
//
// NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
// by the ticks counter. If not defined, a 32-bits counter is generated.
// (reducing its width may be useful for space-constrained designs).
//
// NRV_TWOLEVEL_SHIFTER may be defined to make shift operations faster
// (uses a two-level shifter inspired by picorv32).
//
// Bruno Levy, Matthias Koch, 2020-2021
/*******************************************************************/
// Firmware generation flags for this processor.
// None of these macros is referenced by the core below; presumably the
// firmware build scripts read them to choose compiler options -- TODO confirm.
//   NRV_ARCH     : ISA string (this core implements RV32I, see header above)
//   NRV_ABI      : ABI name
//   NRV_OPTIMIZE : optimization flag
`define NRV_ARCH "rv32i"
`define NRV_ABI "ilp32"
`define NRV_OPTIMIZE "-Os"
// FemtoRV32 "Tachyon" core (RV32I, every SYSTEM instruction returns the
// cycle counter).
//
// Each instruction goes through the following one-hot states:
//   FETCH_INSTR -> WAIT_INSTR -> EXECUTE1 -> EXECUTE2
//               -> [WAIT_ALU_OR_MEM ...] -> FETCH_INSTR
// EXECUTE1 latches the branch/AUIPC/JAL target (PCplusImm), the load/store
// address, the branch predicate and the ALU result; EXECUTE2 updates PC and
// drives the memory strobes from these registered values.
//
// Parameters:
//   RESET_ADDR : value loaded into PC while reset is low.
//   ADDR_WIDTH : width of PC and of all address adders; mem_addr is
//                zero-extended from ADDR_WIDTH bits to 32 bits.
//
// Ports: see the per-port comments below. reset is active low.
module FemtoRV32(
   input         clk,
   output [31:0] mem_addr,  // address bus
   output [31:0] mem_wdata, // data to be written
   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
   input  [31:0] mem_rdata, // input lines for both data and instr
   output        mem_rstrb, // active to initiate memory read (used by IO)
   input         mem_rbusy, // asserted if memory is busy reading value
   input         mem_wbusy, // asserted if memory is busy writing value
   input         reset      // set to 0 to reset the processor
);

   parameter RESET_ADDR = 32'h00000000;
   parameter ADDR_WIDTH = 24;

   /***************************************************************************/
   // Instruction decoding.
   /***************************************************************************/

   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
   // Reference: Table page 104 of:
   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf

   // The destination register
   wire [4:0] rdId = instr[11:7];

   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
   // It is used as follows: funct3Is[val] <=> funct3 == val
   (* onehot *)
   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];

   // The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
   wire [31:0] Uimm = {    instr[31],   instr[30:12], {12{1'b0}}};
   wire [31:0] Iimm = {{21{instr[31]}}, instr[30:20]};
   /* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
   wire [31:0] Simm = {{21{instr[31]}}, instr[30:25],instr[11:7]};
   wire [31:0] Bimm = {{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
   wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
   /* verilator lint_on UNUSED */

   // Base RISC-V (RV32I) has only 10 different instructions !
   // Only opcode bits [6:2] are decoded; bits [1:0] are not even latched.
   wire isLoad    = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
   wire isALUimm  = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
   wire isAUIPC   = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
   wire isStore   = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
   wire isALUreg  = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
   wire isLUI     = (instr[6:2] == 5'b01101); // rd <- Uimm
   wire isBranch  = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
   wire isJALR    = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
   wire isJAL     = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
   wire isSYSTEM  = (instr[6:2] == 5'b11100); // rd <- cycles

   wire isALU = isALUimm | isALUreg;

   /***************************************************************************/
   // The register file.
   /***************************************************************************/

   reg [31:0] rs1;
   reg [31:0] rs2;
   reg [31:0] registerFile [31:0];

   // Write port. x0 is never written (rdId != 0), so it keeps whatever value
   // the memory was initialised with (see the BENCH block at the end).
   // writeBack stays high during EXECUTE2 and every WAIT_ALU_OR_MEM cycle, so
   // the destination is rewritten each cycle and the last write is the one
   // that sticks (final shift result / final load data).
   always @(posedge clk) begin
      if (writeBack)
         if (rdId != 0)
            registerFile[rdId] <= writeBackData;
   end

   /***************************************************************************/
   // The ALU. Does operations and tests combinatorially, except shifts.
   /***************************************************************************/

   // First ALU source, always rs1
   wire [31:0] aluIn1 = rs1;

   // Second ALU source, depends on opcode:
   //    ALUreg, Branch:     rs2
   //    ALUimm, Load, JALR: Iimm
   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;

   reg  [31:0] aluReg;           // The internal register of the ALU, used by shift.
   reg  [4:0]  aluShamt;         // Current shift amount.

   wire aluBusy = |aluShamt;     // ALU is busy if shift amount is non-zero.
   wire aluWr;                   // ALU write strobe, starts shifting.

   // The adder is used by both arithmetic instructions and JALR.
   wire [31:0] aluPlus = aluIn1 + aluIn2;

   // Use a single 33 bits subtract to do subtraction and all comparisons
   // (trick borrowed from swapforth/J1)
   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
   wire        LTU = aluMinus[32];
   wire        EQ  = (aluMinus[31:0] == 0);

   // Notes:
   // - instr[30] is 1 for SUB and 0 for ADD
   // - for SUB, need to test also instr[5] to discriminate ADDI:
   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL

   wire [31:0] aluOut = aluReg;

   wire funct3IsShift = funct3Is[1] | funct3Is[5];

   // ALU result register. On aluWr the combinatorial result is captured; for
   // shifts the unshifted operand and the shift amount are captured instead,
   // and the following cycles shift aluReg until aluShamt reaches zero.
   always @(posedge clk) begin
      if(aluWr) begin
         aluShamt <= funct3IsShift ? aluIn2[4:0] : 5'b0;
         aluReg <=
            (funct3IsShift ? aluIn1 : 32'b0                                     ) |
            (funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
            (funct3Is[2] ? {31'b0, LT}      : 32'b0) |
            (funct3Is[3] ? {31'b0, LTU}     : 32'b0) |
            (funct3Is[4] ? aluIn1 ^ aluIn2  : 32'b0) |
            (funct3Is[6] ? aluIn1 | aluIn2  : 32'b0) |
            (funct3Is[7] ? aluIn1 & aluIn2  : 32'b0) ;
      end
`ifdef NRV_TWOLEVEL_SHIFTER
      else if(|aluShamt[3:2]) begin // Shift by 4
         aluShamt <= aluShamt - 4;
         aluReg <= funct3Is[1] ? aluReg << 4 :
                   {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};
      end else
`endif
      // Compact form of:
      // funct3=001              -> SLL  (aluReg <= aluReg << 1)
      // funct3=101 &  instr[30] -> SRA  (aluReg <= {aluReg[31], aluReg[31:1]})
      // funct3=101 & !instr[30] -> SRL  (aluReg <= {1'b0,       aluReg[31:1]})
      if (|aluShamt) begin
         aluShamt <= aluShamt - 1;
         aluReg <= funct3Is[1] ? aluReg << 1 :              // SLL
                   {instr[30] & aluReg[31], aluReg[31:1]};  // SRA,SRL
      end
   end

   /***************************************************************************/
   // The predicate for conditional branches.
   /***************************************************************************/

   // Combinatorial predicate; it is registered into 'predicate' in EXECUTE1
   // and only the registered copy is used to select the next PC.
   wire predicate_ =
        funct3Is[0] &  EQ  | // BEQ
        funct3Is[1] & !EQ  | // BNE
        funct3Is[4] &  LT  | // BLT
        funct3Is[5] & !LT  | // BGE
        funct3Is[6] &  LTU | // BLTU
        funct3Is[7] & !LTU ; // BGEU

   reg predicate;

   /***************************************************************************/
   // Program counter and branch target computation.
   /***************************************************************************/

   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
                             // ignored (not used in RV32I base instr set).

   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;

   // Registered result of the adder used to compute branch address,
   // JAL address and AUIPC (latched in EXECUTE1).
   reg [ADDR_WIDTH-1:0] PCplusImm;

   // Registered result of the separate adder that computes the destination
   // of load/store (latched in EXECUTE1).
   reg [ADDR_WIDTH-1:0] loadstore_addr;

   /* verilator lint_off WIDTH */
   // internal address registers and cycles counter may have less than
   // 32 bits, so we deactivate width test for mem_addr and writeBackData

   // PC is presented while fetching, the load/store address otherwise.
   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
                     PC : loadstore_addr ;

   /***************************************************************************/
   // The value written back to the register file.
   /***************************************************************************/

   // OR of mutually exclusive sources (the is* decode signals are exclusive).
   wire [31:0] writeBackData  =
      (isSYSTEM            ? cycles     : 32'b0) |  // SYSTEM
      (isLUI               ? Uimm       : 32'b0) |  // LUI
      (isALU               ? aluOut     : 32'b0) |  // ALUreg, ALUimm
      (isAUIPC             ? PCplusImm  : 32'b0) |  // AUIPC
      (isJALR   | isJAL    ? PCplus4    : 32'b0) |  // JAL, JALR
      (isLoad              ? LOAD_data  : 32'b0) ;  // Load

   /* verilator lint_on WIDTH */

   /***************************************************************************/
   // LOAD/STORE
   /***************************************************************************/

   // All memory accesses are aligned on 32 bits boundary. For this
   // reason, we need some circuitry that does unaligned halfword
   // and byte load/store, based on:
   // - funct3[1:0]:  00->byte 01->halfword 10->word
   // - mem_addr[1:0]: indicates which byte/halfword is accessed

   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;

   // LOAD, in addition to funct3[1:0], LOAD depends on:
   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion

   wire LOAD_sign =
        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);

   wire [31:0] LOAD_data =
         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
                          mem_rdata ;

   wire [15:0] LOAD_halfword =
               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];

   wire  [7:0] LOAD_byte =
               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];

   // STORE
   // rs2's low byte / low halfword is copied onto the byte lanes that the
   // write mask below can select for the current address.

   assign mem_wdata[ 7: 0] = rs2[7:0];
   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];

   // The memory write mask:
   //    1111                     if writing a word
   //    0011 or 1100             if writing a halfword
   //                                (depending on loadstore_addr[1])
   //    0001, 0010, 0100 or 1000 if writing a byte
   //                                (depending on loadstore_addr[1:0])

   wire [3:0] STORE_wmask =
              mem_byteAccess      ?
                    (loadstore_addr[1] ?
                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
                    ) :
              mem_halfwordAccess ?
                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
              4'b1111;

   /*************************************************************************/
   // And, last but not least, the state machine.
   /*************************************************************************/

   localparam FETCH_INSTR_bit     = 0;
   localparam WAIT_INSTR_bit      = 1;
   localparam EXECUTE1_bit        = 2;
   localparam EXECUTE2_bit        = 3;
   localparam WAIT_ALU_OR_MEM_bit = 4;
   localparam NB_STATES           = 5;

   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
   localparam EXECUTE1        = 1 << EXECUTE1_bit;
   localparam EXECUTE2        = 1 << EXECUTE2_bit;
   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;

   (* onehot *)
   reg [NB_STATES-1:0] state;

   // The signals (internal and external) that are determined
   // combinatorially from state and other signals.

   // register write-back enable.
   wire writeBack = ~(isBranch | isStore ) &
                    (state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]);

   // The memory-read signal.
   assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];

   // The mask for memory-write.
   assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;

   // aluWr starts computation (shifts) in the ALU.
   assign aluWr = state[EXECUTE1_bit] & isALU;

   wire jumpToPCplusImm = isJAL | (isBranch & predicate);

   // needToWait is sampled in EXECUTE2, where mem_addr is loadstore_addr.
`ifdef NRV_IS_IO_ADDR
   wire needToWait = isLoad |
                     isStore  & `NRV_IS_IO_ADDR(mem_addr) |
                     aluBusy;
`else
   wire needToWait = isLoad | isStore | aluBusy;
`endif

   always @(posedge clk) begin
      if(!reset) begin
         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
      end else

      // See note [1] at the end of this file.
      (* parallel_case *)
      case(1'b1)

        state[WAIT_INSTR_bit]: begin
           if(!mem_rbusy) begin // may be high when executing from SPI flash
              rs1 <= registerFile[mem_rdata[19:15]];
              rs2 <= registerFile[mem_rdata[24:20]];
              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
              state <= EXECUTE1;        // also the declaration of instr).
           end
        end

        state[EXECUTE1_bit]: begin
           // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
           // Equivalent to:
           //   PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
           PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
                               instr[4] ? Uimm[ADDR_WIDTH-1:0] :
                                          Bimm[ADDR_WIDTH-1:0] );

           // testing instr[5] is equivalent to testing isStore in this context.
           loadstore_addr <= rs1[ADDR_WIDTH-1:0] +
                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);

           predicate <= predicate_;
           state <= EXECUTE2;
        end

        state[EXECUTE2_bit]: begin
           // JALR target comes from the ALU adder with bit 0 cleared.
           PC <= isJALR          ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
                 jumpToPCplusImm ? PCplusImm :
                 PCplus4;
           state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
        end

        state[WAIT_ALU_OR_MEM_bit]: begin
           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
        end

        default: begin // FETCH_INSTR
           state <= WAIT_INSTR;
        end

      endcase
   end

   /***************************************************************************/
   // Cycle counter
   /***************************************************************************/

`ifdef NRV_COUNTER_WIDTH
   reg [`NRV_COUNTER_WIDTH-1:0] cycles;
`else
   reg [31:0]  cycles;
`endif
   always @(posedge clk) cycles <= cycles + 1;

   // Simulation-only initialisation (no effect on synthesis, BENCH is only
   // defined by testbenches). Without it, a 4-state simulator starts with:
   //  - aluShamt = X, hence aluBusy = X, and since reset does not touch
   //    aluShamt the test in WAIT_ALU_OR_MEM never succeeds: the core
   //    hangs right after reset;
   //  - registerFile[0] = X, and x0 is never written (rdId != 0 guard),
   //    so every read of x0 would return X;
   //  - cycles = X forever (X + 1 is X).
`ifdef BENCH
   initial begin
      cycles = 0;
      aluShamt = 0;
      registerFile[0] = 0;
   end
`endif

endmodule
/*****************************************************************************/
// Notes:
//
// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
// It is just a cleaner way of writing a series of cascaded if() statements,
// To understand it, think about the case statement *in general* as follows:
// case (expr)
// val_1: statement_1
// val_2: statement_2
// ... val_n: statement_n
// endcase
// The first statement_i such that expr == val_i is executed.
// Now if expr is 1'b1:
// case (1'b1)
// cond_1: statement_1
// cond_2: statement_2
// ... cond_n: statement_n
// endcase
// It is *exactly the same thing*, the first statement_i such that
// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
// in other words, such that cond_i is true)
// More on this:
// https://stackoverflow.com/questions/15418636/case-statement-in-verilog
//
// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
// It uses a larger number of bits (one bit per state), but often results in
// a state machine that is both more compact (fewer LUTs) and faster.

View File

@@ -0,0 +1,782 @@
/******************************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
//
// This version: PetitBateau (make it float), RV32IMFC
// Rounding works as follows:
// - all subnormals are flushed to zero
// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
// - FDIV and FSQRT do not have correct rounding
//
// [TODO] add FPU CSR (and instret for perf stat)
// [TODO] FSW/FLW unaligned (does not seem to occur, but the norm requires it)
// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
// [TODO] support IEEE754 denormals
// [TODO] NaNs propagation and infinity
// [TODO] support all IEEE754 rounding modes
//
// Bruno Levy, Matthias Koch, 2020-2021
/******************************************************************************/
`include "petitbateau.v"
// Firmware generation flags for this processor.
// Note: atomic instructions are not supported, but 'a' is set in the
//  compiler flags, because there is no toolchain/libs for
//  rv32imfc / imf in most RISC-V compiler distributions.
`define NRV_ARCH "rv32imafc"
`define NRV_ABI "ilp32f"
`define NRV_OPTIMIZE "-O0"
// NRV_INTERRUPTS is defined without a value; presumably it advertises that
// this core has the interrupt_request input -- TODO confirm against users.
`define NRV_INTERRUPTS
// Check condition and display message in simulation.
// 'msg' is pasted verbatim right after $display, so it must be a complete
// parenthesized argument list, i.e. the macros are invoked with double
// parentheses:  ASSERT(cond, ("text %d", value))
// With BENCH defined:
//   ASSERT(cond,msg)        prints msg when cond is false
//   ASSERT_NOT_REACHED(msg) always prints msg
// Without BENCH both macros expand to nothing.
`ifdef BENCH
`define ASSERT(cond,msg) if(!(cond)) $display msg
`define ASSERT_NOT_REACHED(msg) $display msg
`else
`define ASSERT(cond,msg)
`define ASSERT_NOT_REACHED(msg)
`endif
module FemtoRV32(
input clk,
output [31:0] mem_addr, // address bus
output [31:0] mem_wdata, // data to be written
output [3:0] mem_wmask, // write mask for the 4 bytes of each word
input [31:0] mem_rdata, // input lines for both data and instr
output mem_rstrb, // active to initiate memory read (used by IO)
input mem_rbusy, // asserted if memory is busy reading value
input mem_wbusy, // asserted if memory is busy writing value
input interrupt_request,
input reset // set to 0 to reset the processor
);
// Bit-reverse a 32-bit word: bit k of the result is bit (31-k) of x.
// Used by the shifter, so that a single right-shifter also serves
// left shifts (saves silicon).
function [31:0] flip32;
   input [31:0] x;
   integer k;
   begin
      // Constant-bound loop, unrolled by synthesis into plain wiring.
      for (k = 0; k < 32; k = k + 1)
         flip32[k] = x[31 - k];
   end
endfunction
parameter RESET_ADDR = 32'h00000000;
parameter ADDR_WIDTH = 24;
localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
/***************************************************************************/
// Instruction decoding.
/***************************************************************************/
// Reference: Table page 104 of:
// https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
wire [2:0] funct3 = instr[14:12];
// The ALU function, decoded in 1-hot form (doing so reduces LUT count)
// It is used as follows: funct3Is[val] <=> funct3 == val
(* onehot *) wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
// The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
wire [31:0] Uimm={ instr[31], instr[30:12], {12{1'b0}}};
wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
/* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
/* verilator lint_on UNUSED */
// Base RISC-V (RV32I) has only 10 different instructions !
wire isLoad = (instr[6:3] == 4'b0000 ); // rd <-mem[rs1+Iimm] (bit 2:FLW)
wire isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
wire isAUIPC = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
wire isStore = (instr[6:3] == 4'b0100 ); // mem[rs1+Simm]<-rs2 (bit 2:FSW)
wire isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
wire isLUI = (instr[6:2] == 5'b01101); // rd <- Uimm
wire isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
wire isJALR = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
wire isJAL = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
wire isSYSTEM = (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
wire isFPU = (instr[6:5] == 2'b10); // all FPU instr except FLW/FSW
wire isALU = isALUimm | isALUreg;
/***************************************************************************/
// The register file.
/***************************************************************************/
reg [31:0] rs1;
reg [31:0] rs2;
reg [31:0] rs3; // this one is used by the FMA instructions.
reg [31:0] registerFile [63:0]; // 0..31: integer registers
// 32..63: floating-point registers
/***************************************************************************/
// The ALU. Does operations and tests combinatorially, except divisions.
/***************************************************************************/
// First ALU source, always rs1
wire [31:0] aluIn1 = rs1;
// Second ALU source, depends on opcode:
// ALUreg, Branch: rs2
// ALUimm, Load, JALR: Iimm
wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
wire aluWr; // ALU write strobe, starts dividing.
// The adder is used by both arithmetic instructions and JALR.
wire [31:0] aluPlus = aluIn1 + aluIn2;
// Use a single 33 bits subtract to do subtraction and all comparisons
// (trick borrowed from swapforth/J1)
wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
wire LT = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
wire LTU = aluMinus[32];
wire EQ = (aluMinus[31:0] == 0);
/***************************************************************************/
// Use the same shifter both for left and right shifts by
// applying bit reversal
wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
/* verilator lint_off WIDTH */
wire [31:0] shifter =
$signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
/* verilator lint_on WIDTH */
wire [31:0] leftshift = flip32(shifter);
/***************************************************************************/
wire funcM = instr[25];
wire isDivide = isALUreg & funcM & instr[14];
wire aluBusy = |div_cnt; // ALU is busy if division is in progress.
// funct3: 1->MULH, 2->MULHSU 3->MULHU
wire isMULH = funct3Is[1];
wire isMULHSU = funct3Is[2];
wire sign1 = aluIn1[31] & isMULH;
wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
wire signed [32:0] signed1 = {sign1, aluIn1};
wire signed [32:0] signed2 = {sign2, aluIn2};
wire signed [63:0] multiply = signed1 * signed2;
/***************************************************************************/
// Notes:
// - instr[30] is 1 for SUB and 0 for ADD
// - for SUB, need to test also instr[5] to discriminate ADDI:
// (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
// - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
wire [31:0] aluOut_base =
(funct3Is[0] ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
(funct3Is[1] ? leftshift : 32'b0) |
(funct3Is[2] ? {31'b0, LT} : 32'b0) |
(funct3Is[3] ? {31'b0, LTU} : 32'b0) |
(funct3Is[4] ? aluIn1 ^ aluIn2 : 32'b0) |
(funct3Is[5] ? shifter : 32'b0) |
(funct3Is[6] ? aluIn1 | aluIn2 : 32'b0) |
(funct3Is[7] ? aluIn1 & aluIn2 : 32'b0) ;
reg [31:0] aluOut_mul;
always @(posedge clk) begin
aluOut_mul <= funct3Is[0] ? multiply[31:0] : multiply[63:32];
end
reg [31:0] aluOut_div;
always @(posedge clk) begin
(* parallel_case, full_case *)
case(1'b1)
instr[13] & div_sign: aluOut_div <= -dividend;
instr[13] & !div_sign: aluOut_div <= dividend;
!instr[13] & div_sign: aluOut_div <= -quotient;
!instr[13] & !div_sign: aluOut_div <= quotient;
endcase
end
reg [31:0] aluOut;
always @(*) begin
(* parallel_case *)
case(1'b1)
isALUreg & funcM & instr[14]: aluOut = aluOut_div;
isALUreg & funcM & !instr[14]: aluOut = aluOut_mul;
default: aluOut = aluOut_base;
endcase
end
/***************************************************************************/
// Implementation of DIV/REM instructions, highly inspired by PicoRV32
// Sequential restoring divider working on magnitudes:
// - instr[12] (funct3[0]) is 0 for the signed variants: operands are then
//   replaced by their absolute values and div_sign records whether the
//   result has to be negated afterwards (done where aluOut_div is computed).
// - instr[13] (funct3[1]) selects the remainder: its sign follows the
//   dividend; the quotient is negated when operand signs differ, and the
//   '& |aluIn2' term suppresses that negation when the divisor is zero.
// After completion 'quotient' holds the quotient and 'dividend' holds the
// remainder.
reg [31:0] dividend;
reg [62:0] divisor;
reg [31:0] quotient;
reg [5:0] div_cnt;
reg div_sign;
always @(posedge clk) begin
// aluWr: load the operands. The divisor starts shifted left by 31 bits.
if (aluWr) begin
div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] :
(aluIn1[31] != aluIn2[31]) & |aluIn2);
dividend <= ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
divisor <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
quotient <= 0;
div_cnt <= isDivide ? 33 : 0; // one additional cycle for aluOut_div
end else begin
if(aluBusy) div_cnt <= div_cnt - 1;
end
// One quotient bit per cycle while div_cnt >= 2 (32 iterations for a
// start value of 33): compare with the current divisor, subtract if it
// fits, and shift the divisor right for the next iteration. During the
// last busy cycle (div_cnt == 1) nothing is updated here.
if(|div_cnt[5:1]) begin
divisor <= divisor >> 1;
if(divisor <= {31'b0, dividend}) begin
quotient <= {quotient[30:0],1'b1};
dividend <= dividend - divisor[31:0];
end else begin
quotient <= {quotient[30:0],1'b0};
end
end
end
/***************************************************************************/
// The predicate for conditional branches.
wire predicate = funct3Is[0] & EQ | // BEQ
funct3Is[1] & !EQ | // BNE
funct3Is[4] & LT | // BLT
funct3Is[5] & !LT | // BGE
funct3Is[6] & LTU | // BLTU
funct3Is[7] & !LTU ; // BGEU
/***************************************************************************/
// Registers read-write
/***************************************************************************/
always @(posedge clk) begin
if(state[WAIT_INSTR_bit]) begin
// Fetch registers as soon as instruction is ready.
rs1 <= registerFile[{raw_rs1IsFP,raw_instr[19:15]}];
rs2 <= registerFile[{raw_rs2IsFP,raw_instr[24:20]}];
rs3 <= registerFile[{1'b1, raw_instr[31:27]}];
end else if(state[DECOMPRESS_GETREGS_bit]) begin
// For compressed instructions, fetch registers once decompressed.
rs1 <= registerFile[{decomp_rs1IsFP,instr[19:15]}];
rs2 <= registerFile[{decomp_rs2IsFP,instr[24:20]}];
// no need to fetch rs3 here, there is no compressed FMA.
end else if(writeBack & !fpuBusy) begin
if(rdIsFP || |instr[11:7]) begin
registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
end
end
end
/***************************************************************************/
// The FPU
/***************************************************************************/
wire fpuBusy;
wire [31:0] fpuOut;
PetitBateau FPU(
.clk(clk),
.wr(state[EXECUTE_bit] & isFPU),
.instr(instr[31:2]),
.rs1(rs1),
.rs2(rs2),
.rs3(rs3),
.busy(fpuBusy),
.out(fpuOut)
);
// There is a single register bank, registers 0..31 are the integer
// registers, and 32..63 are the floating point registers, hence
// bit 5 of rs1,rs2,rd index is set to 0 for an integer register
// and 1 for a fp register.
// asserted if the destination register is a floating-point register
wire rdIsFP = (instr[6:2] == 5'b00001) || // FLW
(instr[6:4] == 3'b100 ) || // F{N}MADD,F{N}MSUB
(instr[6:4] == 3'b101 && (
(instr[31] == 1'b0) || // R-Type FPU
(instr[31:28] == 4'b1101) || // FCVT.S.W{U}
(instr[31:28] == 4'b1111) // FMV.W.X
)
);
// rs1 is a FP register if instr[6:5] = 2'b10 except for:
// FCVT.S.W{U}: instr[6:2] = 5'b10100 and instr[30:28] = 3'b101
// FMV.W.X : instr[6:2] = 5'b10100 and instr[30:28] = 3'b111
// (two versions of the signal, one for regular instruction decode,
// the other one for compressed instructions).
wire raw_rs1IsFP = (raw_instr[6:5] == 2'b10 ) &&
!((raw_instr[4:2] == 3'b100) && (
(raw_instr[31:28] == 4'b1101) || // FCVT.S.W{U}
(raw_instr[31:28] == 4'b1111) // FMV.W.X
)
);
wire decomp_rs1IsFP = (instr[6:5] == 2'b10 ) &&
!((instr[4:2] == 3'b100) && (
(instr[31:28] == 4'b1101) || // FCVT.S.W{U}
(instr[31:28] == 4'b1111) // FMV.W.X
)
);
// rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
// (two versions of the signal, one for regular instruction decode,
// the other one for compressed instructions).
wire raw_rs2IsFP = (raw_instr[6:5] == 2'b10) || (raw_instr[6:2]==5'b01001);
wire decomp_rs2IsFP = (instr[6:5] == 2'b10) || (instr[6:2]==5'b01001);
/***************************************************************************/
// Program counter and branch target computation.
/***************************************************************************/
reg [ADDR_WIDTH-1:0] PC; // The program counter.
reg [31:2] instr; // Latched instruction. Note that bits 0 and 1 are
// ignored (not used in RV32I base instr set).
// The latched value is always a 32-bit encoding: either the
// fetched word or the output of the decompressor.
// Address of the next sequential instruction: PC+4 after a 32-bit
// instruction (long_instr set), PC+2 after a 16-bit compressed one.
wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
wire [ADDR_WIDTH-1:0] PCinc = long_instr ? PCplus4 : PCplus2;
// An adder used to compute branch address, JAL address and AUIPC.
// branch->PC+Bimm AUIPC->PC+Uimm JAL->PC+Jimm
// Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
// (the selection is done directly on opcode bits instr[3] and instr[4]).
wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
instr[4] ? Uimm[ADDR_WIDTH-1:0] :
Bimm[ADDR_WIDTH-1:0] );
// A separate adder to compute the destination of load/store.
// testing instr[5] is equivalent to testing isStore in this context.
wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
(instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
// Address sent to memory, padded with ADDR_PAD:
// - in FETCH_INSTR / WAIT_INSTR: the word-aligned address of the
// instruction (the word that contains PC, or the word that contains PC+4
// when fetch_second_half is set, i.e. when fetching the second half of
// an unaligned 32-bit instruction),
// - in all the other states: the load/store address.
assign mem_addr = {ADDR_PAD,
state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
: {PC [ADDR_WIDTH-1:2], 2'b00}
: loadstore_addr
};
/***************************************************************************/
// Interrupt logic, CSR registers and opcodes.
/***************************************************************************/
// Remember interrupt requests as they are not checked for every cycle
// (a request is only accepted in the EXECUTE state, see below).
reg interrupt_request_sticky;
// Interrupt enable and lock logic:
// a pending request fires only if interrupts are enabled (mstatus set)
// and no interrupt is currently being served (mcause clear).
wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
// Processor accepts interrupts in EXECUTE state.
wire interrupt_accepted = interrupt & state[EXECUTE_bit];
// If current interrupt is accepted, there already might be the next one,
// which should not be missed:
// (a new request on interrupt_request has priority over the clearing done
// by interrupt_accepted).
always @(posedge clk) begin
interrupt_request_sticky <=
interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
end
// Decoder for mret opcode
// NOTE(review): the comparison of instr[31:20] with 12'h302 is commented
// out, so this signal only tests isSYSTEM and funct3Is[0] (presumably
// funct3 == 0 - confirm against the declaration of funct3Is).
wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
// CSRs:
// mstatus and mtvec are written by the CSR block below; mepc and mcause
// are written by the state machine (interrupt entry / interrupt return).
reg [ADDR_WIDTH-1:0] mepc; // The saved program counter.
reg [ADDR_WIDTH-1:0] mtvec; // The address of the interrupt handler.
reg mstatus; // Interrupt enable
reg mcause; // Interrupt cause (and lock)
reg [63:0] cycles; // Cycle counter
// Free-running counter, incremented at each clock (no reset in this block).
always @(posedge clk) cycles <= cycles + 1;
// CSR address decoding: instr[31:20] is the CSR number of the instruction.
wire sel_mstatus = (instr[31:20] == 12'h300);
wire sel_mtvec = (instr[31:20] == 12'h305);
wire sel_mepc = (instr[31:20] == 12'h341);
wire sel_mcause = (instr[31:20] == 12'h342);
wire sel_cycles = (instr[31:20] == 12'hC00);
wire sel_cyclesh = (instr[31:20] == 12'hC80);
// Read CSRs
// (AND-OR mux: at most one sel_xxx is set since they compare the same
// field with different constants; the result is 0 for any other CSR).
// The 1-bit mstatus is presented on bit 3 and the 1-bit mcause on bit 31.
wire [31:0] CSR_read =
(sel_mstatus ? {28'b0, mstatus, 3'b0} : 32'b0) |
(sel_mtvec ? {ADDR_PAD, mtvec} : 32'b0) |
(sel_mepc ? {ADDR_PAD, mepc } : 32'b0) |
(sel_mcause ? {mcause, 31'b0} : 32'b0) |
(sel_cycles ? cycles[31:0] : 32'b0) |
(sel_cyclesh ? cycles[63:32] : 32'b0) ;
// Write CSRs: 5 bit unsigned immediate or content of RS1
// (instr[14] selects the immediate form).
wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;
// instr[13:12] selects the operation:
// 2'b10: set bits 2'b11: clear bits otherwise: plain write
wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read :
(instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
/* (instr[13:12] == 2'b01) ? */ CSR_modifier ;
// CSR write port. Only mstatus (bit 3 of the written value) and mtvec can
// be written by CSR instructions in this block. reset is active low, and
// only mstatus is cleared by it here.
always @(posedge clk) begin
if(!reset) begin
mstatus <= 0;
end else begin
// Execute a CSR opcode
// (SYSTEM instruction with a non-zero funct3, during EXECUTE)
if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
if (sel_mstatus) mstatus <= CSR_write[3];
if (sel_mtvec ) mtvec <= CSR_write[ADDR_WIDTH-1:0];
end
end
end
/***************************************************************************/
// The value written back to the register file.
/***************************************************************************/
// AND-OR mux: each term is zeroed unless its instruction class is decoded,
// and the terms are ORed together. This gives the intended value as long as
// the isXXX signals are mutually exclusive - presumably guaranteed by the
// instruction decoder, which is not visible here.
wire [31:0] writeBackData =
(isSYSTEM ? CSR_read : 32'b0) | // SYSTEM
(isLUI ? Uimm : 32'b0) | // LUI
(isALU ? aluOut : 32'b0) | // ALUreg, ALUimm
(isFPU ? fpuOut : 32'b0) | // FPU
(isAUIPC ? {ADDR_PAD,PCplusImm} : 32'b0) | // AUIPC
(isJALR | isJAL ? {ADDR_PAD,PCinc } : 32'b0) | // JAL, JALR
(isLoad ? LOAD_data : 32'b0); // Load
/***************************************************************************/
// LOAD/STORE
/***************************************************************************/
// All memory accesses are aligned on 32 bits boundary. For this
// reason, we need some circuitry that does unaligned halfword
// and byte load/store, based on:
// - funct3[1:0]: 00->byte 01->halfword 10->word
// - mem_addr[1:0]: indicates which byte/halfword is accessed
// TODO: support unaligned accesses for FLW and FSW
// instr[2] is set for FLW and FSW. instr[13:12] = func3[1:0]
// (when instr[2] is set, both signals below are 0, so the access is
// handled as a word access).
wire mem_byteAccess = !instr[2] && (instr[13:12] == 2'b00);
wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01);
// LOAD, in addition to funct3[1:0], LOAD depends on:
// - funct3[2] (instr[14]): 0->do sign expansion 1->no sign expansion
// LOAD_sign is the bit replicated in the upper bits of a byte/halfword
// load (it is not used by word loads, that return mem_rdata unchanged).
wire LOAD_sign =
!instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
wire [31:0] LOAD_data =
mem_byteAccess ? {{24{LOAD_sign}}, LOAD_byte} :
mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
mem_rdata ;
// Two-stage selection: address bit 1 picks the halfword in the word,
// then address bit 0 picks the byte in that halfword.
wire [15:0] LOAD_halfword =
loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
wire [7:0] LOAD_byte =
loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
// STORE
// The low byte / low halfword of rs2 is copied to the byte lanes selected
// by loadstore_addr[1:0]; STORE_wmask then tells which lanes are written.
assign mem_wdata[ 7: 0] = rs2[7:0];
assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0] : rs2[15: 8];
assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0] : rs2[23:16];
assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0] :
loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
// The memory write mask:
// 1111 if writing a word
// 0011 or 1100 if writing a halfword
// (depending on loadstore_addr[1])
// 0001, 0010, 0100 or 1000 if writing a byte
// (depending on loadstore_addr[1:0])
wire [3:0] STORE_wmask =
mem_byteAccess ?
(loadstore_addr[1] ?
(loadstore_addr[0] ? 4'b1000 : 4'b0100) :
(loadstore_addr[0] ? 4'b0010 : 4'b0001)
) :
mem_halfwordAccess ?
(loadstore_addr[1] ? 4'b1100 : 4'b0011) :
4'b1111;
/***************************************************************************/
// Unaligned fetch mechanism and compressed opcode handling
/***************************************************************************/
// One-word fetch cache: the last 32-bit word read from memory during
// instruction fetch (cached_data) and its word address (cached_addr).
reg [ADDR_WIDTH-1:2] cached_addr;
reg [31:0] cached_data;
// Does the cached word contain the current PC / the next PC ?
wire current_cache_hit = cached_addr == PC [ADDR_WIDTH-1:2];
wire next_cache_hit = cached_addr == PC_new [ADDR_WIDTH-1:2];
// "unaligned long": the instruction starts in the upper halfword (PC[1] set)
// and the two LSBs of that halfword are both 1, which is the marker tested
// elsewhere in this file for a 32-bit (non-compressed) instruction. Such an
// instruction spans two memory words.
wire current_unaligned_long = &cached_mem [17:16] & PC [1];
wire next_unaligned_long = &cached_data[17:16] & PC_new[1];
reg fetch_second_half; // set while fetching the word that follows PC's word
reg long_instr; // latched: current instruction is a 32-bit one
// Word that contains PC: from the cache on a hit, else from memory.
wire [31:0] cached_mem = current_cache_hit ? cached_data : mem_rdata;
// Instruction as fetched. If PC points to the upper halfword, the
// instruction is made of the upper half of the word that contains PC
// followed by the lower half of the word currently read from memory.
wire [31:0] raw_instr = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
: cached_mem;
// Expansion of the low 16 bits of raw_instr to a 32-bit instruction
// (only meaningful when raw_instr[1:0] != 2'b11).
wire [31:0] decompressed;
decompressor _decomp ( .c(raw_instr[15:0]), .d(decompressed) );
/*************************************************************************/
// And, last but not least, the state machine.
/*************************************************************************/
// State encoding: one bit per state (1-hot). XXX_bit is the index of the
// bit in the state register, XXX is the corresponding 1-hot value.
localparam FETCH_INSTR_bit = 0;
localparam WAIT_INSTR_bit = 1;
localparam DECOMPRESS_GETREGS_bit = 2;
localparam EXECUTE_bit = 3;
localparam WAIT_ALU_OR_MEM_bit = 4;
localparam WAIT_ALU_OR_MEM_SKIP_bit = 5;
localparam NB_STATES = 6;
localparam FETCH_INSTR = 1 << FETCH_INSTR_bit;
localparam WAIT_INSTR = 1 << WAIT_INSTR_bit;
localparam DECOMPRESS_GETREGS = 1 << DECOMPRESS_GETREGS_bit;
localparam EXECUTE = 1 << EXECUTE_bit;
localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
localparam WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;
(* onehot *)
reg [NB_STATES-1:0] state;
// The signals (internal and external) that are determined
// combinatorially from state and other signals.
// register write-back enable.
// (asserted in EXECUTE and in both wait states, for all instructions
// except branch and store, and only when the FPU is not busy)
wire writeBack = ~(isBranch | isStore ) & !fpuBusy & (
state[EXECUTE_bit] |
state[WAIT_ALU_OR_MEM_bit] |
state[WAIT_ALU_OR_MEM_SKIP_bit]
);
// The memory-read signal.
// (data read when executing a load, instruction read in FETCH_INSTR)
assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
// The mask for memory-write.
// (all zeroes except in EXECUTE state for a store)
assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
// aluWr starts computation (divide) in the ALU.
assign aluWr = state[EXECUTE_bit] & isALU;
// Taken jump to PC+imm: JAL, or branch with a true predicate.
wire jumpToPCplusImm = isJAL | (isBranch & predicate);
// Instructions that need one of the wait states after EXECUTE:
// loads, stores to IO addresses, ALUreg instructions with funcM set and
// all FPU instructions.
wire needToWait = isLoad |
(isStore & `NRV_IS_IO_ADDR(mem_addr)) |
isALUreg & funcM /* isDivide */ |
isFPU;
// Next value of the program counter, by decreasing priority:
// JALR (target computed by the ALU adder, with bit 0 cleared), taken
// jump/branch, return from interrupt, next sequential instruction.
wire [ADDR_WIDTH-1:0] PC_new =
isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
jumpToPCplusImm ? PCplusImm :
interrupt_return ? mepc :
PCinc;
// The main sequencer (1-hot state machine, active-low synchronous reset).
//
//   FETCH_INSTR          memory read issued (mem_rstrb), go to WAIT_INSTR
//   WAIT_INSTR           when memory is ready: update the fetch cache, latch
//                        the instruction (raw or decompressed); if it is a
//                        32-bit instruction that starts in the upper halfword
//                        and its second half is not there yet, fetch the
//                        next word, else go to EXECUTE (32-bit instruction)
//                        or DECOMPRESS_GETREGS (16-bit instruction)
//   DECOMPRESS_GETREGS   one additional cycle for compressed instructions
//   EXECUTE              update PC (or enter the interrupt handler), then
//                        wait (needToWait) and/or fetch; the fetch is
//                        skipped if the next instruction is in the cache
//   WAIT_ALU_OR_MEM      wait for ALU/FPU/memory, then FETCH_INSTR
//   WAIT_ALU_OR_MEM_SKIP wait for ALU/FPU/memory, then WAIT_INSTR
//                        (next instruction already in the cache)
always @(posedge clk) begin
   if(!reset) begin
      state             <= WAIT_ALU_OR_MEM; //Just waiting for !mem_wbusy
      PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
      mcause            <= 0;
      cached_addr       <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
      fetch_second_half <= 0;
   end else begin

      // See note [1] at the end of this file.
      (* parallel_case *)
      case(1'b1)

        state[WAIT_INSTR_bit]: begin
           if(!mem_rbusy) begin // may be high when executing from SPI flash
              // Update cache
              // (with the word just read from memory, on a miss or when
              // the second half of an unaligned instruction was fetched)
              if (~current_cache_hit | fetch_second_half) begin
                 cached_addr <= mem_addr[ADDR_WIDTH-1:2];
                 cached_data <= mem_rdata;
              end

              // Decode instruction
              // Registers are fetched at the same time, in the
              // FPU's always block.
              // (raw_instr[1:0] == 2'b11 means 32-bit instruction, else
              // the output of the decompressor is used)
              instr      <= &raw_instr[1:0] ? raw_instr[31:2]
                                            : decompressed[31:2];
              long_instr <= &raw_instr[1:0];

              // Long opcode, unaligned, first part fetched,
              // happens in non-linear code
              if (current_unaligned_long & ~fetch_second_half) begin
                 fetch_second_half <= 1;
                 state <= FETCH_INSTR;
              end else begin
                 fetch_second_half <= 0;
                 state <= &raw_instr[1:0] ? EXECUTE : DECOMPRESS_GETREGS;
              end
           end
        end

        state[DECOMPRESS_GETREGS_bit]: begin
           // All the registers are fetched in FPU's always block.
           state <= EXECUTE;
        end

        state[EXECUTE_bit]: begin
           if (interrupt) begin
              // Enter the interrupt handler: jump to mtvec, save in mepc
              // the address where execution resumes, and lock interrupts
              // (mcause) until the interrupt return.
              PC     <= mtvec;
              mepc   <= PC_new;
              mcause <= 1;
              state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
           end else begin
              // Unaligned load/store not implemented yet
              // (the norm supposes that FLW and FSW can handle them)
              `ASSERT(
                  !((isLoad|isStore) && instr[2] && |loadstore_addr[1:0]),
                  ("PC=%x UNALIGNED FLW/FSW",PC)
              );
              PC <= PC_new;
              if (interrupt_return) mcause <= 0;

              // If the next instruction is completely in the cache, the
              // FETCH_INSTR state is skipped. If only the first half of
              // an unaligned 32-bit instruction is in the cache, fetch
              // the word that contains the second half.
              state <= next_cache_hit & ~next_unaligned_long
                       ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
                       : (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);
              fetch_second_half <= next_cache_hit & next_unaligned_long;
           end
        end

        state[WAIT_ALU_OR_MEM_bit]: begin
           if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
              state <= FETCH_INSTR;
           end
        end

        state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
           if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
              state <= WAIT_INSTR;
           end
        end

        default: begin // FETCH_INSTR
           state <= WAIT_INSTR;
        end

      endcase
   end
end
`ifdef BENCH
// Simulation only: give a defined start value to register 0 and to the
// cycle counter.
initial begin
   registerFile[0] = 0;
   cycles          = 0;
end
`endif
endmodule
/*****************************************************************************/
// Combinatorial expansion of a 16-bit compressed instruction to the
// equivalent 32-bit instruction.
// c: the 16-bit compressed instruction
// d: the expanded 32-bit instruction
// The casez items below are evaluated in order: where two patterns overlap,
// the first one wins (c.illegal before c.addi4spn, c.addi16sp before c.lui,
// c.jr before c.mv, c.jalr before c.add).
module decompressor(
input wire [15:0] c,
output reg [31:0] d
);
// Notes: * replaced illegal, unknown, x0, x1, x2 with
// 'localparam' instead of 'wire='
// * could split decoding into multiple cycles
// if decompressor is a bottleneck
// How to handle illegal and unknown opcodes
// (both are the all-zero word; 'unknown' is only referenced by the
// commented-out default item at the end of the casez).
localparam illegal = 32'h0;
localparam unknown = 32'h0;
// Register decoder
// (3-bit register fields are prefixed with 2'b01, that is, they designate
// x8..x15)
wire [4:0] rcl = {2'b01, c[4:2]}; // Register compressed low
wire [4:0] rch = {2'b01, c[9:7]}; // Register compressed high
wire [4:0] rwl = c[ 6:2]; // Register wide low
wire [4:0] rwh = c[11:7]; // Register wide high
localparam x0 = 5'b00000;
localparam x1 = 5'b00001;
localparam x2 = 5'b00010;
// Immediate decoder
// (the scattered immediate bits of each compressed format, put back in
// order; the 12-bit ones are directly usable as an I-type immediate)
wire [4:0] shiftImm = c[6:2];
wire [11:0] addi4spnImm = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
wire [11:0] lwswImm = {5'b00000, c[5], c[12:10] , c[6], 2'b00};
wire [11:0] lwspImm = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
wire [11:0] swspImm = {4'b0000, c[8:7], c[12:9], 2'b00};
wire [11:0] addi16spImm = {{ 3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
wire [11:0] addImm = {{ 7{c[12]}}, c[6:2]};
// Not all the bits of the three immediates below are used (bit 0 of bImm
// and jalImm, bits 11:0 of luiImm), hence the lint pragma.
/* verilator lint_off UNUSED */
wire [12:0] bImm = {{ 5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
wire [20:0] jalImm = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
wire [31:0] luiImm = {{15{c[12]}}, c[6:2], 12'b000000000000};
/* verilator lint_on UNUSED */
// Pattern layout: funct3 (c[15:13]), operand fields (c[12:2]), quadrant
// (c[1:0]). Quadrant 2'b11 (32-bit instructions) has no item and falls in
// the default (all X).
always @*
casez (c[15:0])
// imm / funct7 + rs2 rs1 fn3 rd opcode
// 16'b???___????????_???_11 : d = c ; // Long opcode, no need to decompress
/* verilator lint_off CASEOVERLAP */
16'b000___00000000_000_00 : d = illegal ; // c.illegal --> illegal
16'b000___????????_???_00 : d = { addi4spnImm, x2, 3'b000, rcl, 7'b00100_11} ; // c.addi4spn --> addi rd', x2, nzuimm[9:2]
/* verilator lint_on CASEOVERLAP */
16'b010_???_???_??_???_00 : d = { lwswImm, rch, 3'b010, rcl, 7'b00000_11} ; // c.lw --> lw rd', offset[6:2](rs1')
16'b110_???_???_??_???_00 : d = { lwswImm[11:5], rcl, rch, 3'b010, lwswImm[4:0], 7'b01000_11} ; // c.sw --> sw rs2', offset[6:2](rs1')
16'b000_???_???_??_???_01 : d = { addImm, rwh, 3'b000, rwh, 7'b00100_11} ; // c.addi --> addi rd, rd, nzimm[5:0]
16'b001____???????????_01 : d = { jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12], x1, 7'b11011_11} ; // c.jal --> jal x1, offset[11:1]
16'b010__?_?????_?????_01 : d = { addImm, x0, 3'b000, rwh, 7'b00100_11} ; // c.li --> addi rd, x0, imm[5:0]
16'b011__?_00010_?????_01 : d = { addi16spImm, rwh, 3'b000, rwh, 7'b00100_11} ; // c.addi16sp --> addi x2, x2, nzimm[9:4]
16'b011__?_?????_?????_01 : d = { luiImm[31:12], rwh, 7'b01101_11} ; // c.lui --> lui rd, nzuimm[17:12]
16'b100_?_00_???_?????_01 : d = { 7'b0000000, shiftImm, rch, 3'b101, rch, 7'b00100_11} ; // c.srli --> srli rd', rd', shamt[5:0]
16'b100_?_01_???_?????_01 : d = { 7'b0100000, shiftImm, rch, 3'b101, rch, 7'b00100_11} ; // c.srai --> srai rd', rd', shamt[5:0]
16'b100_?_10_???_?????_01 : d = { addImm, rch, 3'b111, rch, 7'b00100_11} ; // c.andi --> andi rd', rd', imm[5:0]
16'b100_011_???_00_???_01 : d = { 7'b0100000, rcl, rch, 3'b000, rch, 7'b01100_11} ; // c.sub --> sub rd', rd', rs2'
16'b100_011_???_01_???_01 : d = { 7'b0000000, rcl, rch, 3'b100, rch, 7'b01100_11} ; // c.xor --> xor rd', rd', rs2'
16'b100_011_???_10_???_01 : d = { 7'b0000000, rcl, rch, 3'b110, rch, 7'b01100_11} ; // c.or --> or rd', rd', rs2'
16'b100_011_???_11_???_01 : d = { 7'b0000000, rcl, rch, 3'b111, rch, 7'b01100_11} ; // c.and --> and rd', rd', rs2'
16'b101____???????????_01 : d = { jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12], x0, 7'b11011_11} ; // c.j --> jal x0, offset[11:1]
16'b110__???_???_?????_01 : d = {bImm[12], bImm[10:5], x0, rch, 3'b000, bImm[4:1], bImm[11], 7'b11000_11} ; // c.beqz --> beq rs1', x0, offset[8:1]
16'b111__???_???_?????_01 : d = {bImm[12], bImm[10:5], x0, rch, 3'b001, bImm[4:1], bImm[11], 7'b11000_11} ; // c.bnez --> bne rs1', x0, offset[8:1]
16'b000__?_?????_?????_10 : d = { 7'b0000000, shiftImm, rwh, 3'b001, rwh, 7'b00100_11} ; // c.slli --> slli rd, rd, shamt[5:0]
16'b010__?_?????_?????_10 : d = { lwspImm, x2, 3'b010, rwh, 7'b00000_11} ; // c.lwsp --> lw rd, offset[7:2](x2)
16'b100__0_?????_00000_10 : d = { 12'b000000000000, rwh, 3'b000, x0, 7'b11001_11} ; // c.jr --> jalr x0, rs1, 0
16'b100__0_?????_?????_10 : d = { 7'b0000000, rwl, x0, 3'b000, rwh, 7'b01100_11} ; // c.mv --> add rd, x0, rs2
// The c.ebreak item below is disabled: the c.ebreak encoding
// (c[12] = 1, c[11:2] = 0) is matched by the c.jalr item that follows,
// and is expanded to jalr x1, x0, 0.
// 16'b100__1_00000_00000_10 : d = { 25'b00000000_00010000_00000000_0, 7'b11100_11} ; // c.ebreak --> ebreak
16'b100__1_?????_00000_10 : d = { 12'b000000000000, rwh, 3'b000, x1, 7'b11001_11} ; // c.jalr --> jalr x1, rs1, 0
16'b100__1_?????_?????_10 : d = { 7'b0000000, rwl, rwh, 3'b000, rwh, 7'b01100_11} ; // c.add --> add rd, rd, rs2
16'b110__?_?????_?????_10 : d = { swspImm[11:5], rwl, x2, 3'b010, swspImm[4:0], 7'b01000_11} ; // c.swsp --> sw rs2, offset[7:2](x2)
// Four compressed RV32F load/store instructions
16'b011_???_???_??_???_00 : d = { lwswImm, rch, 3'b010, rcl, 7'b00001_11} ; // c.flw --> flw rd', offset[6:2](rs1')
16'b111_???_???_??_???_00 : d = { lwswImm[11:5], rcl, rch, 3'b010, lwswImm[4:0], 7'b01001_11} ; // c.fsw --> fsw rs2', offset[6:2](rs1')
16'b011__?_?????_?????_10 : d = { lwspImm, x2, 3'b010, rwh, 7'b00001_11} ; // c.flwsp --> flw rd, offset[7:2](x2)
16'b111__?_?????_?????_10 : d = { swspImm[11:5], rwl, x2, 3'b010, swspImm[4:0], 7'b01001_11} ; // c.fswsp --> fsw rs2, offset[7:2](x2)
// All the other encodings: output is "don't care" (all X).
// default: d = unknown ; // Unknown opcode
default: d = 32'bXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX;
endcase
endmodule
/*****************************************************************************/

856
RTL/PROCESSOR/petitbateau.v Normal file
View File

@@ -0,0 +1,856 @@
/******************************************************************************/
// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
//
// PetitBateau (make it float): a simple single-precision RISC-V FPU
// Mission statement: achieve a good area/performance ratio, by
// implementing a full-precision FMA (48 bits), and micro-programmed
// Newton-Raphson for FDIV and FSQRT (that reuse the FMA).
//
// Rounding works as follows:
// - all subnormals are flushed to zero
// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
// - FDIV and FSQRT do not have correct rounding
// if PRECISE_DIV is set (default), then FDIV rounding is validated in
// tinyraytracer test. Complete proof remains to be done
//
// [TODO] add FPU CSR (and instret for perf stat)
// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
// [TODO] support IEEE754 denormals
// [TODO] NaNs propagation and infinity
// [TODO] support all IEEE754 rounding modes
//
// Bruno Levy, 2021
/******************************************************************************/
// TODO: instead of mux between A,B,C and FMA, make FMA always compute
// A*B+C and mux rs1,rs2,rs3,1.0,0.0 to A,B,C based on instr (mux
// will be more complicated but will probably reduce overall
// critical path) ?
// TODO: there are too many different paths between the internal registers,
// maybe micro-instructions could be redesigned with this in mind.
// A could be the MSBs of X, avoiding all MV_A_X instructions.
// TODO: the necessity to copy rs1 in E without flushing denormals for
// the int-to-fp instructions is inelegant.
// Include guard for LiteX
`ifndef PETITBATEAU_INCLUDED
`define PETITBATEAU_INCLUDED
// Check condition and display message in simulation
`ifdef BENCH
`define ASSERT(cond,msg) if(!(cond)) $display msg
`define ASSERT_NOT_REACHED(msg) $display msg
`else
`define ASSERT(cond,msg)
`define ASSERT_NOT_REACHED(msg)
`endif
module PetitBateau(
input clk,
input wr, // write strobe, starts computation
input [31:2] instr, // current riscv instruction
// operands
input [31:0] rs1,
input [31:0] rs2,
input [31:0] rs3,
// outputs
output busy,
output [31:0] out
);
// Set to 1 for higher-precision FDIV (costs 30 additional cycles per FDIV)
parameter PRECISE_DIV = 1;
// Uncomment the line below to emulate all FPU instructions in Verilator
// (useful to test instruction decoder and implementations of micro-instr
// in C++). See SIM/FPU_funcs.{h,cpp}
//`define FPU_EMUL
// Two high-resolution registers for the FMA, that computes X+Y
// Register X has the accumulator / shifters / leading zero counter
// Normalized if first bit set is bit 47
// Represented number is +/- frac * 2^(exp-127-47)
reg X_sign; reg signed [8:0] X_exp; reg signed [49:0] X_frac;
reg Y_sign; reg signed [8:0] Y_exp; reg signed [49:0] Y_frac;
// FPU output = 32 MSBs of X register (see below)
// A macro to easily write to it (`X <= ...),
// used when FPU output is an integer.
`define X {X_sign, X_exp[7:0], X_frac[46:24]}
assign out = `X;
// Five single-precision floating-point registers for internal use.
// A,B,C are wired to the FMA that computes either A*B+C or A+B
// D,E are temporaries used by FDIV and FSQRT
// Following IEEE754, represented number is +/- frac * 2^(exp-127-23)
// (127: bias 23: position of first bit set for normalized numbers)
reg A_sign; reg [7:0] A_exp; reg [23:0] A_frac;
reg B_sign; reg [7:0] B_exp; reg [23:0] B_frac;
reg C_sign; reg [7:0] C_exp; reg [23:0] C_frac;
reg D_sign; reg [7:0] D_exp; reg [23:0] D_frac;
reg E_sign; reg [7:0] E_exp; reg [23:0] E_frac;
/*************************************************************************/
// Load a 32-bit value in RD
// RD: one of A,B,C,D,E
// VAL: a 32-bit value
`define FP_LD32(RD,VAL) \
{RD``_sign, RD``_exp, RD``_frac[22:0]} <= VAL; RD``_frac[23] <= 1'b1
// Load floating point value in RD by sign, exponent, fraction
// RD: one of A,B,C,D,E
// sign: 1'b1 (-) or 1'b0 (+)
// exp: 8-bits, biased exponent
// frac: 24-bit fraction
`define FP_LD(RD,sign,eexp,frac) \
{RD``_sign, RD``_exp, RD``_frac} <= {sign,eexp,frac}
// RD <= RS
// RD,RS: one of A,B,C,D,E
`define FP_MV(RD,RS) \
{RD``_sign, RD``_exp, RD``_frac} <= {RS``_sign, RS``_exp, RS``_frac}
/** FPU micro-instructions and ROM ****************************************/
localparam FPMI_READY = 0;
localparam FPMI_LOAD_XY = 1; // X <- A; Y <- B
localparam FPMI_LOAD_XY_MUL = 2; // X <- norm(A*B); Y <- C
localparam FPMI_ADD_SWAP = 3; // if |X|>|Y| swap(X,Y);
// if sign(X) != sign(Y) X <- -X
localparam FPMI_ADD_SHIFT = 4; // shift X to match Y exponent
localparam FPMI_ADD_ADD = 5; // X <- X + Y
localparam FPMI_ADD_NORM = 6; // X <- norm(X) (after ADD_ADD)
localparam FPMI_CMP = 7; // X <- test X,Y (FEQ,FLE,FLT)
localparam FPMI_MV_A_X = 8; // A <- X
localparam FPMI_MV_B_D = 9; // B <- D
localparam FPMI_MV_B_NH_D = 10; // B <- -0.5*|D|
localparam FPMI_MV_B_E = 11; // B <- E
localparam FPMI_MV_C_A = 12; // C <- A
localparam FPMI_MV_E_X = 13; // E <- X
localparam FPMI_FRCP_PROLOG = 14; // init reciprocal (1/x)
localparam FPMI_FRCP_ITER1 = 15; // iteration for reciprocal
localparam FPMI_FRCP_ITER2 = 16; // iteration for reciprocal
localparam FPMI_FRCP_EPILOG = 17; // epilog for reciprocal
localparam FPMI_FDIV_EPILOG = 18; // epilog for fdiv IEEE-754 rounding
localparam FPMI_FRSQRT_PROLOG = 19; // init recipr sqr root (1/sqrt(x))
localparam FPMI_FP_TO_INT = 20; // fpuOut <- fpoint_to_int(A)
localparam FPMI_INT_TO_FP = 21; // X <- int_to_fpoint(X)
localparam FPMI_MIN_MAX = 22; // fpuOut <- min/max(X,Y)
localparam FPMI_LOAD_Y_ROUND = 23; // Y <- round to nearest
localparam FPMI_NB = 24;
// Instruction exit flag (if set in current micro-instr, exit microprogram)
localparam FPMI_EXIT_FLAG_bit = 1+$clog2(FPMI_NB);
localparam FPMI_EXIT_FLAG = 1 << FPMI_EXIT_FLAG_bit;
reg [6:0] fpmi_PC; // current micro-instruction pointer
reg [1+$clog2(FPMI_NB):0] fpmi_instr; // current micro-instruction
// current micro-instruction as 1-hot: fpmi_instr == NNN <=> fpmi_is[NNN]
(* onehot *)
wire [FPMI_NB-1:0] fpmi_is = 1 << fpmi_instr[$clog2(FPMI_NB):0];
initial fpmi_PC = 0;
assign busy = !fpmi_is[FPMI_READY];
// Generate a micro-instruction in ROM
task fpmi_gen(input [6:0] instr);
   // Appends one micro-instruction word to the ROM at the write cursor I,
   // then advances the cursor. Meant to be called from the 'initial' block
   // that fills the ROM.
   begin
      fpmi_ROM[I] = instr;
      I = I + 1;
   end
endtask
// Generate a FMA sequence in ROM.
// Use fpmi_gen_fma(0) in the middle of a micro-program
// Use fpmi_gen_fma(FPMI_EXIT_FLAG) if last instruction of micro-program
task fpmi_gen_fma(input [6:0] flags);
   // Appends the five micro-instructions of a fused multiply-add
   // (X <- A*B+C) to the ROM. 'flags' is ORed into the last one, so that
   // the caller can mark it as the end of a micro-program.
   begin
      fpmi_gen(FPMI_LOAD_XY_MUL);      // 1. X <- norm(A*B), Y <- C
      fpmi_gen(FPMI_ADD_SWAP);         // 2. if(|X| > |Y|) swap(X,Y) (and sgn)
      fpmi_gen(FPMI_ADD_SHIFT);        // 3. shift X according to Y exp
      fpmi_gen(FPMI_ADD_ADD);          // 4. X <- X + Y
      fpmi_gen(FPMI_ADD_NORM | flags); // 5. X <- normalize(X)
   end
endtask
integer I; // current ROM location in initialization
integer iter; // iteration variable for generate Newton-Raphson (FDIV,FSQRT)
localparam FPMI_ROM_SIZE=82 + (12 + 18)*PRECISE_DIV;
reg [1+$clog2(FPMI_NB):0] fpmi_ROM[0:FPMI_ROM_SIZE-1];
// Microprograms start addresses
// Programmatically determined when generating the ROM ('initial' block below)
integer FPMPROG_CMP, FPMPROG_ADD, FPMPROG_MUL, FPMPROG_MADD, FPMPROG_DIV;
integer FPMPROG_FP_TO_INT, FPMPROG_INT_TO_FP, FPMPROG_SQRT, FPMPROG_MIN_MAX;
// Start the definition of a microprogram (determines start address)
`define FPMPROG_BEGIN(prg) prg = I
// Ends the definition of a microprogram (displays stats in Verilator)
`ifdef BENCH
`define FPMPROG_END(prg) \
$display("# %3d microinstructions used by %d:%s",I-prg,prg,`"prg`")
`else
`define FPMPROG_END(prg)
`endif
/******************** Generate microprograms in ROM **********************/
// Fills fpmi_ROM with all the micro-programs, one after the other, and
// records the start address of each of them in the FPMPROG_xxx variables.
// Address 0 contains FPMI_READY with the exit flag (the idle state of the
// FPU). The last micro-instruction of each micro-program carries
// FPMI_EXIT_FLAG.
// Number of micro-instructions generated below
// (PRECISE_DIV=1 / PRECISE_DIV=0):
// idle entry 1, CMP 2, ADD 5, MUL 1, MADD 5, DIV 62 / 32, FP_TO_INT 2,
// INT_TO_FP 3, SQRT 29, MIN_MAX 2 --> total 112 / 82 (= FPMI_ROM_SIZE)
initial begin
`ifdef BENCH
$display("# Generating FPMI ROM...");
`endif
I = 0;
fpmi_gen(FPMI_READY | FPMI_EXIT_FLAG);
// ******************** FLT, FLE, FEQ *********************************
`FPMPROG_BEGIN(FPMPROG_CMP);
fpmi_gen(FPMI_LOAD_XY); // X <- A, Y <- B
fpmi_gen(FPMI_CMP | FPMI_EXIT_FLAG); // X <- compare(X,Y)
`FPMPROG_END(FPMPROG_CMP);
// ******************** FADD, FSUB ************************************
`FPMPROG_BEGIN(FPMPROG_ADD);
fpmi_gen(FPMI_LOAD_XY); // X <- A, Y <- B
fpmi_gen(FPMI_ADD_SWAP); // if(|X| > |Y|) swap(X,Y) (,sgn)
fpmi_gen(FPMI_ADD_SHIFT); // shift X according to Y exp
fpmi_gen(FPMI_ADD_ADD); // X <- X + Y
fpmi_gen(FPMI_ADD_NORM | FPMI_EXIT_FLAG); // X <- normalize(X)
`FPMPROG_END(FPMPROG_ADD);
// ******************** FMUL ******************************************
`FPMPROG_BEGIN(FPMPROG_MUL);
fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B
`FPMPROG_END(FPMPROG_MUL);
// ******************** FMADD, FMSUB, FNMADD, FNMSUB ******************
`FPMPROG_BEGIN(FPMPROG_MADD);
fpmi_gen_fma(FPMI_EXIT_FLAG); // X <- A*B+C (5 cycles)
`FPMPROG_END(FPMPROG_MADD);
// ******************** FDIV ******************************************
// Newton-Raphson computation of the reciprocal of the denominator,
// followed by a multiplication by the numerator. See:
// https://en.wikipedia.org/wiki/Division_algorithm
// https://stackoverflow.com/questions/24792966/
// error-using-newton-raphson-iteration-method-for-
// floating-point-division
//
`FPMPROG_BEGIN(FPMPROG_DIV);
// D' = denominator (rs2) normalized between [0.5,1] (set exp to 126)
fpmi_gen(FPMI_FRCP_PROLOG); // D<-A; E<-B; A<-(-D'); B<-32/17; C<-48/17
fpmi_gen_fma(0); // X <- A*B+C (= -D'*32/17 + 48/17)
for(iter=0; iter<3; iter=iter+1) begin
if(PRECISE_DIV) begin
// X <- X + X*(1-D'*X)
// (slower more precise iter, but not IEEE754 compliant yet...)
fpmi_gen(FPMI_FRCP_ITER1); // A <- -D'; B <- X; C <- 1.0f
fpmi_gen_fma(0); // X <- A*B+C (5 cycles)
fpmi_gen(FPMI_FRCP_ITER2); // A <- X; C <- B
fpmi_gen_fma(0); // X <- A*B+C (5 cycles)
end else begin
// X <- X * (-X*D' + 2)
// (faster but less precise)
fpmi_gen(FPMI_FRCP_ITER1); // A <- -D'; B <- X; C <- 2.0f
fpmi_gen_fma(0); // X <- A*B+C (5 cycles)
fpmi_gen(FPMI_MV_A_X); // A <- X
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B; Y <- C
end
end
if(PRECISE_DIV) begin // round X to nearest
fpmi_gen(FPMI_LOAD_Y_ROUND);
fpmi_gen(FPMI_ADD_ADD);
fpmi_gen(FPMI_ADD_NORM);
end
fpmi_gen(FPMI_FRCP_EPILOG); // A <- (E_sign,frcp_exp,X_frac); B <- D
if(PRECISE_DIV) begin // error correction
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B
fpmi_gen(FPMI_FDIV_EPILOG); // B <- -E; C <- D; D <- A
fpmi_gen(FPMI_MV_A_X); // A <- X
fpmi_gen_fma(0); // X <- A*B+C
fpmi_gen(FPMI_MV_C_A); // C <- A
fpmi_gen(FPMI_MV_B_D); // B <- D
fpmi_gen(FPMI_MV_A_X); // A <- X
fpmi_gen_fma(FPMI_EXIT_FLAG); // X <- A*B+C
end else begin
fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B
end
`FPMPROG_END(FPMPROG_DIV);
// ******************** FCVT.W.S, FCVT.WU.S ***************************
`FPMPROG_BEGIN(FPMPROG_FP_TO_INT);
fpmi_gen(FPMI_LOAD_XY); // X <- A, Y <- B
fpmi_gen(FPMI_FP_TO_INT | FPMI_EXIT_FLAG); // X <- fpoint_to_int(X)
`FPMPROG_END(FPMPROG_FP_TO_INT);
// ******************** FCVT.S.W, FCVT.S.WU ***************************
`FPMPROG_BEGIN(FPMPROG_INT_TO_FP); // Compute A+0 (use CLZ plugged on X)
fpmi_gen(FPMI_INT_TO_FP); // X <- 0; Y <- integer (raw rs1 saved in E)
fpmi_gen(FPMI_ADD_ADD); // X <- X + Y
fpmi_gen(FPMI_ADD_NORM | FPMI_EXIT_FLAG); // X <- normalize(X)
`FPMPROG_END(FPMPROG_INT_TO_FP);
// ******************** FSQRT *****************************************
// Using the "fast inverse square root" algorithm (popularized by
// Quake III Arena; called "doom magic" in this file):
// https://en.wikipedia.org/wiki/Fast_inverse_square_root
// http://www.lomont.org/papers/2003/InvSqrt.pdf
// TODO: IEEE754-compliant version
// See https://t.co/V1SWQ6N6xD?amp=1 (Method of Switching Constants)
// See simple effective fast inverse square root with two magic
// constants.
//
`FPMPROG_BEGIN(FPMPROG_SQRT);
// D<-rs1; E,A,B<-(doom_magic - (A >> 1)); C<-3/2
fpmi_gen(FPMI_FRSQRT_PROLOG);
for(iter=0; iter<2; iter=iter+1) begin
// X <- X * (3/2 - (0.5*rs1*X*X))
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B; Y <- C
fpmi_gen(FPMI_MV_A_X); // A <- X
fpmi_gen(FPMI_MV_B_NH_D); // B <- -0.5*|D|
fpmi_gen_fma(0); // X <- A*B+C
fpmi_gen(FPMI_MV_A_X); // A <- X
fpmi_gen(FPMI_MV_B_E); // B <- E
fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B; Y <- C
if(iter==0) begin
// prepare the second iteration: E, A and B <- new estimate
fpmi_gen(FPMI_MV_E_X); // E <- X
fpmi_gen(FPMI_MV_A_X); // A <- X
fpmi_gen(FPMI_MV_B_E); // B <- E
end
end // X contains 1/sqrt(rs1), now compute rs1*X to get sqrt(rs1)
fpmi_gen(FPMI_MV_A_X); // A <- X
fpmi_gen(FPMI_MV_B_D); // B <- D
fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B; Y <- C
`FPMPROG_END(FPMPROG_SQRT);
// ******************** FMIN, FMAX ************************************
`FPMPROG_BEGIN(FPMPROG_MIN_MAX);
fpmi_gen(FPMI_LOAD_XY); // X <- A, Y <- B
fpmi_gen(FPMI_MIN_MAX | FPMI_EXIT_FLAG); // X <- min/max(X,Y)
`FPMPROG_END(FPMPROG_MIN_MAX);
`ifdef BENCH
// Simulation only: check that the ROM is large enough
$display("# FPMI ROM max address:%d",I-1);
$display("# FPMI ROM size :%d",FPMI_ROM_SIZE);
`ASSERT(I <= FPMI_ROM_SIZE,("!!!!!!! FPMI ROM SIZE exceeded !!!!!!!"));
`endif
end
`ifndef FPU_EMUL
// determine microprogram to be called based on decoded instruction
// (fpmprog is the start address in fpmi_ROM of the microprogram, taken from
// the 7 LSBs of the FPMPROG_xxx variables set when the ROM is generated;
// it is 0, the address of the FPMI_READY entry, for all the other
// instructions, e.g. the ones executed in a single cycle).
reg [6:0] fpmprog;
always @(*) begin
(* parallel_case, full_case *)
case(1'b1)
isFLT | isFLE | isFEQ : fpmprog = FPMPROG_CMP[6:0];
isFADD | isFSUB : fpmprog = FPMPROG_ADD[6:0];
isFMUL : fpmprog = FPMPROG_MUL[6:0];
isFMADD | isFMSUB | isFNMADD | isFNMSUB : fpmprog = FPMPROG_MADD[6:0];
isFDIV : fpmprog = FPMPROG_DIV[6:0];
isFSQRT : fpmprog = FPMPROG_SQRT[6:0];
isFCVTWS | isFCVTWUS : fpmprog = FPMPROG_FP_TO_INT[6:0];
isFCVTSW | isFCVTSWU : fpmprog = FPMPROG_INT_TO_FP[6:0];
isFMIN | isFMAX : fpmprog = FPMPROG_MIN_MAX[6:0];
default : fpmprog = 0;
endcase
end
// next micro-instruction program counter, by decreasing priority:
// - wr: start of the microprogram selected by the decoded instruction,
// - exit flag set in the current micro-instruction: back to address 0
// (FPMI_READY),
// - else: next micro-instruction.
wire [6:0] fpmi_PC_next =
wr ? fpmprog :
fpmi_instr[FPMI_EXIT_FLAG_bit] ? 0 :
fpmi_PC+1 ;
// The ROM is read with the *next* address, so that fpmi_instr always
// contains the micro-instruction at address fpmi_PC.
// NOTE(review): fpmi_PC has an initial value (initial fpmi_PC = 0) but no
// initial value for fpmi_instr is visible in this part of the file; in a
// 4-state simulation it would start as X - confirm it is initialized
// elsewhere or that it does not matter for the simulators in use.
always @(posedge clk) begin
fpmi_PC <= fpmi_PC_next;
fpmi_instr <= fpmi_ROM[fpmi_PC_next];
end
// Execution unit of the FPU.
// - when wr is asserted: latches the operands in A,B,C (and the raw rs1 in
// E) and executes the single-cycle instructions, that directly write
// their result to `X (the FPU output),
// - else, while busy: executes the current micro-instruction (decoded as
// 1-hot in fpmi_is).
always @(posedge clk) begin
if(wr) begin
// Denormals are flushed to zero
// (a null exponent field gives a null fraction, else the implicit
// leading one is made explicit in bit 23 of the fraction)
`FP_LD(A, rs1[31], rs1[30:23], (|rs1[30:23]?{1'b1,rs1[22:0]}:24'b0));
`FP_LD(B, rs2[31], rs2[30:23], (|rs2[30:23]?{1'b1,rs2[22:0]}:24'b0));
`FP_LD(C, rs3[31], rs3[30:23], (|rs3[30:23]?{1'b1,rs3[22:0]}:24'b0));
// Backup rs1 in E without flushing to zero (for int2fp instructions)
`FP_LD32(E, rs1);
// Single-cycle instructions
(* parallel_case *)
case(1'b1)
isFSGNJ : `X <= { rs2[31], rs1[30:0]};
isFSGNJN : `X <= { !rs2[31], rs1[30:0]};
isFSGNJX : `X <= { rs1[31]^rs2[31], rs1[30:0]};
isFCLASS : `X <= fclass;
isFMVXW | isFMVWX : `X <= rs1;
endcase
end else if(busy) begin
// Implementation of the micro-instructions
(* parallel_case *)
case(1'b1)
// X <- A ; Y <- B
// (the sign of Y is flipped for FSUB; the 24-bit fractions are
// aligned on bit 47 of the 50-bit X and Y fractions)
fpmi_is[FPMI_LOAD_XY]: begin
X_sign <= A_sign;
X_frac <= {2'b0, A_frac, 24'd0};
X_exp <= {1'b0, A_exp};
Y_sign <= B_sign ^ isFSUB;
Y_frac <= {2'b0, B_frac, 24'd0};
Y_exp <= {1'b0, B_exp};
end
// X <- (+/-) normalize(A*B); Y <- (+/-)C
// (signs are flipped according to the FMSUB/FNMSUB/FNMADD variant;
// the product is shifted left by one if its bit 47 is not set)
fpmi_is[FPMI_LOAD_XY_MUL]: begin
X_sign <= A_sign ^ B_sign ^ (isFNMSUB | isFNMADD);
X_frac <= prod_Z ? 0 :
(prod_frac[47] ? prod_frac : {prod_frac[48:0],1'b0});
X_exp <= prod_Z ? 0 : prod_exp_norm;
Y_sign <= C_sign ^ (isFMSUB | isFNMADD);
Y_frac <= {2'b0, C_frac, 24'd0};
Y_exp <= {1'b0, C_exp};
end
// if(|X| > |Y|) swap(X,Y)
// if X_sign != Y_sign X <- -X
// We always *add*, but replace X_frac with -X_frac if the
// sign of the operands differ, THEN we shift (signed shift). In
// this way, rounding is correct, even when subtracting a
// low magnitude number from a large magnitude one.
fpmi_is[FPMI_ADD_SWAP]: begin
if(fabsY_LT_fabsX) begin
X_frac <= (X_sign ^ Y_sign) ? -Y_frac : Y_frac;
Y_frac <= X_frac;
X_exp <= Y_exp; Y_exp <= X_exp;
X_sign <= Y_sign; Y_sign <= X_sign;
end else if(X_sign ^ Y_sign) begin
X_frac <= -X_frac;
end
end
// shift X in order to make it match Y exponent
fpmi_is[FPMI_ADD_SHIFT]: begin
`ASSERT(!fabsY_LT_fabsX, ("ADD_SHIFT: incorrect order"));
X_frac <= X_frac >>> exp_diff; // note the signed shift !
X_exp <= Y_exp;
end
// X <- X (+/-) Y
// (also prepares the shift amount and the exponent used by ADD_NORM)
fpmi_is[FPMI_ADD_ADD]: begin
X_frac <= frac_sum[49:0];
X_sign <= Y_sign;
// normalization left shamt = 47 - first_bit_set = clz - 16
norm_lshamt <= frac_sum_clz - 16;
// Exponent of X once normalized = X_exp + first_bit_set - 47
// = X_exp + 63 - clz - 47 = X_exp + 16 - clz
X_exp_norm <= X_exp + 16 - {3'b000,frac_sum_clz};
end
// X <- normalize(X) (after ADD_ADD -> norm_lshamt and X_exp_norm)
// (the result is flushed to zero if the normalized exponent is
// negative or null; if bit 48 is set, the fraction is shifted right
// by one, else it is shifted left by norm_lshamt)
fpmi_is[FPMI_ADD_NORM]: begin
if(X_exp_norm <= 0 || (X_frac == 0)) begin
X_frac <= 0;
X_exp <= 0;
end else begin
X_frac <= X_frac[48] ? (X_frac >> 1) : X_frac << norm_lshamt;
X_exp <= X_exp_norm;
end
end
// Y <- rounding increment for X: same sign and exponent as X, with
// bit 24 of the fraction set if bit 23 of X_frac is set (else zero).
fpmi_is[FPMI_LOAD_Y_ROUND]: begin
Y_sign <= X_sign;
Y_exp <= X_exp;
Y_frac <= X_frac[23] ? (1 << 24) : 50'd0;
end
// X <- result of comparison between X and Y
// (0 or 1, written as an integer to the FPU output)
fpmi_is[FPMI_CMP]: begin
`X <= { 31'b0,
isFLT && X_LT_Y ||
isFLE && X_LE_Y ||
isFEQ && X_EQ_Y
};
end
// Moves between registers (for moves from X, the 24 MSBs of the
// normalized fraction, X_frac[47:24], are kept)
fpmi_is[FPMI_MV_B_D] : `FP_MV(B,D);
fpmi_is[FPMI_MV_B_E] : `FP_MV(B,E);
fpmi_is[FPMI_MV_A_X] : `FP_LD(A,X_sign,X_exp[7:0],X_frac[47:24]);
fpmi_is[FPMI_MV_C_A] : `FP_MV(C,A);
fpmi_is[FPMI_MV_E_X] : `FP_LD(E,X_sign,X_exp[7:0],X_frac[47:24]);
// B <= -|D| / 2.0
// (sign forced to 1, exponent decremented)
fpmi_is[FPMI_MV_B_NH_D]:
{B_sign, B_exp, B_frac} <= {1'b1,D_exp-8'd1,D_frac};
// Reciprocal prolog: saves the numerator in D and the denominator
// in E, and loads the operands of the initial estimate.
fpmi_is[FPMI_FRCP_PROLOG]: begin
`FP_MV(D,A);
`FP_MV(E,B);
// A <= -D', that is, -(B normalized in [0.5,1])
`FP_LD(A,1'b1,8'd126, B_frac);
`FP_LD32(B, 32'h3FF0F0F1); // 32/17
`FP_LD32(C, 32'h4034B4B5); // 48/17
end
fpmi_is[FPMI_FRCP_ITER1]: begin
`FP_LD(A,1'b1,8'd126, E_frac); // A <= -D'
`FP_LD(B,X_sign,X_exp[7:0],X_frac[47:24]); // B <= X
// 1.0 2.0
`FP_LD32(C, PRECISE_DIV ? 32'h3f800000 : 32'h40000000);
end
// This one is used only if PRECISE_DIV is set
fpmi_is[FPMI_FRCP_ITER2]: begin
`FP_LD(A,X_sign,X_exp[7:0],X_frac[47:24]); // A <= X
`FP_MV(C,B);
end
// A <= reciprocal (sign of the denominator, exponent frcp_exp,
// fraction from X); B <= D (the numerator saved by the prolog)
fpmi_is[FPMI_FRCP_EPILOG]: begin
`FP_LD(A,E_sign,frcp_exp[7:0],X_frac[47:24]);
`FP_MV(B,D);
end
// This one is used only if PRECISE_DIV is set
fpmi_is[FPMI_FDIV_EPILOG]: begin
`FP_LD(B,!E_sign, E_exp, E_frac); // B <= -E
`FP_MV(C,D);
`FP_MV(D,A);
end
// Reciprocal square root prolog: D <= rs1, E,A,B <= initial
// estimate (rsqrt_doom_magic), C <= 1.5
fpmi_is[FPMI_FRSQRT_PROLOG]: begin
`FP_LD32(D, rs1);
`FP_LD32(E, rsqrt_doom_magic);
`FP_LD32(A, rsqrt_doom_magic);
`FP_LD32(B, rsqrt_doom_magic);
`FP_LD32(C, 32'h3fc00000); // 1.5
end
// FPU output <- integer value of X (negated if X is negative, except
// for the unsigned conversion)
fpmi_is[FPMI_FP_TO_INT]: begin
// TODO: check overflow
`X <=
(isFCVTWUS | !X_sign) ? X_fcvt_ftoi_shifted
: -$signed(X_fcvt_ftoi_shifted);
end
fpmi_is[FPMI_INT_TO_FP]: begin
// TODO: rounding
// We do a fake addition with zero, to prepare normalization
// (uses CLZ plugged on the adder).
// The integer is the raw value of rs1, saved in E when wr was
// asserted; its absolute value is used for the signed conversion.
X_frac <= 0;
// 127+23: standard exponent bias
// +6 because it is bit 29 of rs1 that overwrites
// bit 47 of Y_frac, instead of bit 23 (and 29-23 = 6).
X_exp <= 127+23+6;
Y_frac <=
(isFCVTSWU | !E_sign) ? {E_sign, E_exp, E_frac[22:0], 18'd0}
: {-$signed({E_sign, E_exp, E_frac[22:0]}), 18'd0};
Y_sign <= isFCVTSW & E_sign;
end
// FPU output <- X if (X < Y) xor isFMAX, else Y
fpmi_is[FPMI_MIN_MAX]: begin
`X <= (X_LT_Y ^ isFMAX)
? {X_sign, X_exp[7:0], X_frac[46:24]}
: {Y_sign, Y_exp[7:0], Y_frac[46:24]};
end
endcase
end
end
`endif
// Some circuitry used by the FPU micro-instructions:
// Everything in this section is purely combinational: each wire is a
// function of the FPU working registers (X_*, Y_*, A_*, B_*, E_*) that are
// declared elsewhere in this module.
// ******************* Comparisons ******************************************
// Exponent adder
wire signed [8:0] exp_sum = Y_exp + X_exp;
// exp_diff = Y_exp - X_exp on 9 signed bits. Bit 8 is the sign bit, so
// exp_diff[8] is set when the difference is negative, i.e. Y_exp < X_exp
// (as long as the subtraction fits in 9 bits).
wire signed [8:0] exp_diff = Y_exp - X_exp;
wire expX_EQ_expY = (exp_diff == 0);
// frac_diff is declared further down (Y_frac - X_frac, 51 signed bits);
// frac_diff[50] is its sign bit, set when Y_frac < X_frac.
wire fracX_EQ_fracY = (frac_diff == 0);
wire fabsX_EQ_fabsY = (expX_EQ_expY && fracX_EQ_fracY);
// |X| < |Y|: Y has the strictly larger exponent, or the exponents are equal
// and Y_frac - X_frac is strictly positive.
wire fabsX_LT_fabsY = (!exp_diff[8] && !expX_EQ_expY) ||
(expX_EQ_expY && !fracX_EQ_fracY && !frac_diff[50]);
// |X| <= |Y|: same as above, but equal fractions are accepted.
wire fabsX_LE_fabsY = (!exp_diff[8] && !expX_EQ_expY) ||
(expX_EQ_expY && !frac_diff[50]);
// |Y| < |X|: X has the larger exponent (negative exp_diff), or the
// exponents are equal and Y_frac - X_frac is negative.
wire fabsY_LT_fabsX = exp_diff[8] || (expX_EQ_expY && frac_diff[50]);
wire fabsY_LE_fabsX = exp_diff[8] ||
(expX_EQ_expY && (frac_diff[50] || fracX_EQ_fracY));
// Signed comparisons built from the sign bits and the magnitude comparisons:
//   X negative, Y positive      -> X < Y
//   both negative               -> X < Y iff |Y| < |X|
//   both positive               -> X < Y iff |X| < |Y|
//   X positive, Y negative      -> never (no term below covers it)
wire X_LT_Y = X_sign && !Y_sign ||
X_sign && Y_sign && fabsY_LT_fabsX ||
!X_sign && !Y_sign && fabsX_LT_fabsY ;
wire X_LE_Y = X_sign && !Y_sign ||
X_sign && Y_sign && fabsY_LE_fabsX ||
!X_sign && !Y_sign && fabsX_LE_fabsY ;
// Equality requires identical magnitude AND identical sign.
wire X_EQ_Y = fabsX_EQ_fabsY && (X_sign == Y_sign);
// ****************** Addition, subtraction *********************************
// 51-bit signed results; bit 50 is the sign bit used by the comparisons above.
wire signed [50:0] frac_sum = Y_frac + X_frac;
wire signed [50:0] frac_diff = Y_frac - X_frac;
// ****************** Product ***********************************************
wire [49:0] prod_frac = A_frac * B_frac; // TODO: check overflows
// exponent of product, once normalized
// (obtained by writing expression of product and inspecting exponent)
// Two cases: first bit set = 47 or 46 (only possible cases with normals)
// prod_frac[47] adds 1 to the exponent when the product's leading one is
// at bit 47 rather than bit 46.
wire signed [8:0] prod_exp_norm = A_exp+B_exp-127+{7'b0,prod_frac[47]};
// detect null product and underflows (all denormals are flushed to zero)
// prod_Z is set when the normalized exponent is <= 0 or when neither
// bit 47 nor bit 46 of the product is set.
wire prod_Z = (prod_exp_norm <= 0) || !(|prod_frac[47:46]);
// ****************** Normalization *****************************************
// Count leading zeroes in A+B
// NOTE(review): "A"/"B" in the comments of this section look like older
// names for what the code now calls X/Y (the CLZ input is frac_sum, which
// is Y_frac + X_frac) -- confirm against the rest of the module.
// Note1: CLZ only work with power of two width (hence 13'b0 padding).
// Note2: first bit set = 63 - CLZ (of course !)
wire [5:0] frac_sum_clz;
// 13 zero bits + the 51 bits of frac_sum = 64-bit CLZ input.
CLZ clz2({13'b0,frac_sum}, frac_sum_clz);
reg [5:0] norm_lshamt; // shift amount for ADD normalization
// Exponent of A once normalized = X_exp + first_bit_set - 47
// = X_exp + 63 - clz - 47 = X_exp + 16 - clz
// X_exp_norm <= X_exp + 16 - {3'b000,A_clz};
// (norm_lshamt and X_exp_norm are written by the FPMI_ADD_ADD
// micro-instruction and consumed by FPMI_ADD_NORM.)
reg signed [8:0] X_exp_norm;
// ****************** Reciprocal (1/x), used by FDIV ************************
// Exponent for reciprocal (1/x)
// Initial value of x kept in E.
wire signed [8:0] frcp_exp = 9'd126 + X_exp - $signed({1'b0, E_exp});
// ****************** Reciprocal square root (1/sqrt(x)) ********************
// https://en.wikipedia.org/wiki/Fast_inverse_square_root
// Magic constant minus the {exponent, fraction} bits of A shifted right by
// one position (sign bit forced to 0).
wire [31:0] rsqrt_doom_magic = 32'h5f3759df - {1'b0,A_exp, A_frac[22:1]};
// ****************** Float to Integer conversion ***************************
// -127-23 is standard exponent bias
// -6 because it is bit 29 of X that corresponds to bit 47 of X_frac,
// instead of bit 23 (and 23-29 = -6).
wire signed [8:0] fcvt_ftoi_shift = A_exp - 9'd127 - 9'd23 - 9'd6;
wire signed [8:0] neg_fcvt_ftoi_shift = -fcvt_ftoi_shift;
// Negative shift (sign bit set): shift right by the negated amount, or
// return 0 when that amount is 32 or more (any of bits 8:5 set).
// Non-negative shift: shift left; only the low 5 bits of the amount are
// used, so larger shifts are not detected here.
wire [31:0] X_fcvt_ftoi_shifted = fcvt_ftoi_shift[8] ? // R or L shift
(|neg_fcvt_ftoi_shift[8:5] ? 0 : // underflow
({X_frac[49:18]} >> neg_fcvt_ftoi_shift[4:0])) :
({X_frac[49:18]} << fcvt_ftoi_shift[4:0]);
// ******************* Classification ***************************************
// Field tests on the raw rs1 bit pattern. As used below, rs1[31] is the
// sign, rs1[30:23] the exponent field and rs1[22:0] the fraction field.
wire rs1_exp_Z = (rs1[30:23] == 0 );
wire rs1_exp_255 = (rs1[30:23] == 255);
wire rs1_frac_Z = (rs1[22:0] == 0 );
// Result word for FCLASS: 22 zero bits followed by 10 class bits.
// The concatenation lists bit 9 first and bit 0 last, so the order of the
// entries below must not be changed. The terms are mutually exclusive.
wire [31:0] fclass = {
22'b0,
rs1_exp_255 & rs1[22], // 9: quiet NaN
rs1_exp_255 & !rs1[22] & (|rs1[21:0]), // 8: sig NaN
!rs1[31] & rs1_exp_255 & rs1_frac_Z, // 7: +infinity
!rs1[31] & !rs1_exp_Z & !rs1_exp_255, // 6: +normal
!rs1[31] & rs1_exp_Z & !rs1_frac_Z, // 5: +subnormal
!rs1[31] & rs1_exp_Z & rs1_frac_Z, // 4: +0
rs1[31] & rs1_exp_Z & rs1_frac_Z, // 3: -0
rs1[31] & rs1_exp_Z & !rs1_frac_Z, // 2: -subnormal
rs1[31] & !rs1_exp_Z & !rs1_exp_255, // 1: -normal
rs1[31] & rs1_exp_255 & rs1_frac_Z // 0: -infinity
};
/************************************************************************/
// RV32F instruction decoder
// See table p133 (RV32G instruction listings)
// Notes:
// - FLW/FSW handled by LOAD/STORE in femtorv32 (instr[2] set if FLW/FSW)
// - For all other F instructions, instr[6:5] == 2'b10
// - FMADD/FMSUB/FNMADD/FNMSUB: instr[4] = 1'b0
// - For all remaining F instructions, instr[4] = 1'b1
// - FMV.X.W and FCLASS have same funct7 (7'b1110000),
// (discriminated by instr[12])
// - there is a big gotcha in the official doc for RV32F:
// the doc says FNMADD computes -rs1*rs2-rs3
// (yes, with *minus* rs3)
// it should have said FNMADD computes -(rs1*rs2+rs3)
// and FNMSUB computes -(rs1*rs2-rs3)
// they probably did not put the parentheses because when
// you implement it, you change the sign of rs1 and rs3 according
// to the operation rather than the sign of the whole result
// (here, it is done by the FPMI_LOAD_XY_MUL micro instruction).
// Decode flags, one per RV32F instruction handled by the FPU.
// They are declared as reg only because they are assigned from an always
// block: every flag is a pure combinational function of instr.
reg isFMADD, isFMSUB, isFNMSUB, isFNMADD;
reg isFADD, isFSUB, isFMUL, isFDIV, isFSQRT;
reg isFSGNJ, isFSGNJN, isFSGNJX;
reg isFMIN, isFMAX;
reg isFEQ, isFLT, isFLE;
reg isFCLASS, isFCVTWS, isFCVTWUS;
reg isFCVTSW, isFCVTSWU;
reg isFMVXW, isFMVWX;
always @(*) begin
   // ---- Fused multiply-add family: selected by instr[4:2] alone
   //      (all four patterns have instr[4] == 0).
   isFMADD   = (instr[4:2] == 3'b000);       // rd <- rs1*rs2+rs3
   isFMSUB   = (instr[4:2] == 3'b001);       // rd <- rs1*rs2-rs3
   isFNMSUB  = (instr[4:2] == 3'b010);       // rd <- -(rs1*rs2-rs3)
   isFNMADD  = (instr[4:2] == 3'b011);       // rd <- -(rs1*rs2+rs3)

   // ---- All remaining instructions have instr[4] set and are told apart
   //      by instr[31:27], plus instr[13:12], instr[12] or instr[20]
   //      where several instructions share the same instr[31:27].

   // Arithmetic
   isFADD    = instr[4] & (instr[31:27] == 5'b00000);
   isFSUB    = instr[4] & (instr[31:27] == 5'b00001);
   isFMUL    = instr[4] & (instr[31:27] == 5'b00010);
   isFDIV    = instr[4] & (instr[31:27] == 5'b00011);
   isFSQRT   = instr[4] & (instr[31:27] == 5'b01011);

   // Sign injection, discriminated by instr[13:12]
   isFSGNJ   = instr[4] & (instr[31:27] == 5'b00100) & (instr[13:12] == 2'b00);
   isFSGNJN  = instr[4] & (instr[31:27] == 5'b00100) & (instr[13:12] == 2'b01);
   isFSGNJX  = instr[4] & (instr[31:27] == 5'b00100) & (instr[13:12] == 2'b10);

   // Min / max, discriminated by instr[12]
   isFMIN    = instr[4] & (instr[31:27] == 5'b00101) & ~instr[12];
   isFMAX    = instr[4] & (instr[31:27] == 5'b00101) &  instr[12];

   // Comparisons, discriminated by instr[13:12]
   isFEQ     = instr[4] & (instr[31:27] == 5'b10100) & (instr[13:12] == 2'b10);
   isFLT     = instr[4] & (instr[31:27] == 5'b10100) & (instr[13:12] == 2'b01);
   isFLE     = instr[4] & (instr[31:27] == 5'b10100) & (instr[13:12] == 2'b00);

   // FCLASS and FMV.X.W share instr[31:27]; instr[12] tells them apart
   isFCLASS  = instr[4] & (instr[31:27] == 5'b11100) &  instr[12];
   isFMVXW   = instr[4] & (instr[31:27] == 5'b11100) & ~instr[12];

   // Conversions: instr[20] selects the unsigned variant
   isFCVTWS  = instr[4] & (instr[31:27] == 5'b11000) & ~instr[20];
   isFCVTWUS = instr[4] & (instr[31:27] == 5'b11000) &  instr[20];
   isFCVTSW  = instr[4] & (instr[31:27] == 5'b11010) & ~instr[20];
   isFCVTSWU = instr[4] & (instr[31:27] == 5'b11010) &  instr[20];

   // Integer register -> FP register move
   isFMVWX   = instr[4] & (instr[31:27] == 5'b11110);
end
// Emulated FPU: only compiled when FPU_EMUL is defined. Instead of running
// the micro-instructions, the result register (`X, a macro defined elsewhere
// in this file) is loaded with the value returned by $c32(...).
// NOTE(review): $c32 is presumably Verilator's inline-C system function,
// so this path would be simulation-only -- confirm.
`ifdef FPU_EMUL
// One macro per operand count. The arguments to $c32 spell out a call of
// the form op(rs1), op(rs1,rs2) or op(rs1,rs2,rs3).
`define FPU_EMUL1(op) `X <= $c32(op,"(",rs1,")")
`define FPU_EMUL2(op) `X <= $c32(op,"(",rs1,",",rs2,")")
`define FPU_EMUL3(op) `X <= $c32(op,"(",rs1,",",rs2,",",rs3,")")
always @(posedge clk) begin
// The result is computed in the clock cycle where wr is asserted.
if(wr) begin
// case(1'b1) selects the first isXXX flag that is set; parallel_case
// tells the tools that the flags are mutually exclusive.
(* parallel_case *)
case(1'b1)
isFMUL : `FPU_EMUL2("FMUL");
isFADD : `FPU_EMUL2("FADD");
isFSUB : `FPU_EMUL2("FSUB");
isFDIV : `FPU_EMUL2("FDIV");
isFSQRT : `FPU_EMUL1("FSQRT");
isFMADD : `FPU_EMUL3("FMADD");
isFMSUB : `FPU_EMUL3("FMSUB");
isFNMADD : `FPU_EMUL3("FNMADD");
isFNMSUB : `FPU_EMUL3("FNMSUB");
isFEQ : `FPU_EMUL2("FEQ");
isFLT : `FPU_EMUL2("FLT");
isFLE : `FPU_EMUL2("FLE");
isFCVTWS : `FPU_EMUL1("FCVTWS");
isFCVTWUS: `FPU_EMUL1("FCVTWUS");
isFCVTSW : `FPU_EMUL1("FCVTSW");
isFCVTSWU: `FPU_EMUL1("FCVTSWU");
isFMIN : `FPU_EMUL2("FMIN");
isFMAX : `FPU_EMUL2("FMAX");
isFCLASS : `FPU_EMUL1("FCLASS");
isFSGNJ : `FPU_EMUL2("FSGNJ");
isFSGNJN : `FPU_EMUL2("FSGNJN");
isFSGNJX : `FPU_EMUL2("FSGNJX");
// Both register moves simply copy the raw bits of rs1.
isFMVXW | isFMVWX : `X <= rs1;
endcase
end
end
`endif
/****************************************************************************/
// When doing simulations, compare the result of all operations with
// what's computed on the host CPU.
// Note: my FDIV and FSQRT are not IEEE754 compliant (yet) !
// (checks commented-out for now)
// NOTE: in the case statement below only the FSQRT check is actually
// commented out; the FDIV check is still active.
`ifdef NRV_FEMTORV32_PETITBATEAU // makes sure we are in the learn-FPGA fmwk
`ifdef VERILATOR
// One macro per operand count. Each passes the FPU result (`X) followed by
// the source operands to a function whose name is "CHECK_" + op, through
// $c32, and stores the returned value in z.
`define FPU_CHECK1(op) \
z <= $c32("CHECK_",op,"(",`X,",",rs1,")")
`define FPU_CHECK2(op) \
z <= $c32("CHECK_",op,"(",`X,",",rs1,",",rs2,")")
`define FPU_CHECK3(op) \
z <= $c32("CHECK_",op,"(",`X,",",rs1,",",rs2,",",rs3,")")
reg [31:0] z; // receives the value returned by the check function
reg active; // set when an operation was started by wr and is not checked yet
always @(posedge clk) begin
if(wr) begin
active <= 1'b1;
end
// Run the check once, on the first clock edge where an operation is
// pending and the FPU is no longer busy.
if(active && !busy) begin
active <= 1'b0;
// No parallel_case attribute here: the first flag that is set wins.
// Instructions without an entry below are not checked.
case(1'b1)
isFMUL : `FPU_CHECK2("FMUL");
isFADD : `FPU_CHECK2("FADD");
isFSUB : `FPU_CHECK2("FSUB");
isFDIV : `FPU_CHECK2("FDIV");
// isFSQRT: `FPU_CHECK1("FSQRT"); // yes I know, not IEEE754 yet
isFMADD: `FPU_CHECK3("FMADD");
isFMSUB: `FPU_CHECK3("FMSUB");
isFNMADD: `FPU_CHECK3("FNMADD");
isFNMSUB: `FPU_CHECK3("FNMSUB");
isFEQ: `FPU_CHECK2("FEQ");
isFLT: `FPU_CHECK2("FLT");
isFLE: `FPU_CHECK2("FLE");
isFCVTWS : `FPU_CHECK1("FCVTWS");
isFCVTWUS: `FPU_CHECK1("FCVTWUS");
isFCVTSW : `FPU_CHECK1("FCVTSW");
isFCVTSWU: `FPU_CHECK1("FCVTSWU");
isFMIN: `FPU_CHECK2("FMIN");
isFMAX: `FPU_CHECK2("FMAX");
endcase
end
end
`endif
`endif
endmodule
/**********************************************************************/
// CLZ: Count Leading Zeroes.
//
// The FPU normalization step needs the position of the first bit set in
// a fraction register; counting the zeroes in front of it is the easier
// way to get there. Recursive divide-and-conquer formulation, see:
// https://electronics.stackexchange.com/questions/196914/
//    verilog-synthesize-high-speed-leading-zero-count
//
// Parameters:
//   W_IN  : input width, must be a power of 2, >= 2
//   W_OUT : output width, $clog2(W_IN) by default
// Ports:
//   in    : word to examine
//   out   : number of zero bits above the most significant 1 of in.
//           Since out only has W_OUT bits, an all-zero input yields
//           W_IN-1 (same result as an input whose only set bit is bit 0).
//
// TODO: test also Dean Gaudet's algorithm (see Hackers Delights p. 110)
module CLZ #(
   parameter W_IN  = 64,            // must be power of 2, >= 2
   parameter W_OUT = $clog2(W_IN)
) (
   input  wire [W_IN-1:0]  in,
   output wire [W_OUT-1:0] out
);

   generate
      if(W_IN != 2) begin
         // Recursive case: split the input in two halves. The MSB of the
         // count tells whether the upper half is entirely zero; the
         // remaining bits are the count inside the half that holds the
         // first bit set (the lower half if the upper one is empty).
         wire [W_IN/2-1:0] hi_half = in[W_IN-1 : W_IN/2];
         wire [W_IN/2-1:0] lo_half = in[W_IN/2-1 : 0];
         wire              hi_is_zero = ~(|hi_half);
         wire [W_OUT-2:0]  sub_count;

         CLZ #(
            .W_IN(W_IN/2)
         ) inner(
            .in(hi_is_zero ? lo_half : hi_half),
            .out(sub_count)
         );

         assign out = {hi_is_zero, sub_count};
      end else begin
         // Base case, two bits: one leading zero iff the top bit is clear.
         assign out = !in[1];
      end
   endgenerate

endmodule
`endif

22
RTL/PROCESSOR/utils.v Normal file
View File

@@ -0,0 +1,22 @@
/********************* Utilities, macros for debugging *************/
// `verbose(command): expands to command when VERBOSE is defined,
// and to nothing otherwise.
`ifdef VERBOSE
`define verbose(command) command
`else
`define verbose(command)
`endif
// `bench(command): expands to command only when BENCH is defined and QUIET
// is not; in every other configuration it expands to nothing.
// BENCH_OR_LINT is defined whenever BENCH is defined (and also when
// verilator is defined, see below).
`ifdef BENCH
`define BENCH_OR_LINT
`ifdef QUIET
`define bench(command)
`else
`define bench(command) command
`endif
`else
`define bench(command)
`endif
// Note the lowercase spelling of the symbol tested here.
`ifdef verilator
`define BENCH_OR_LINT
`endif

330
RTL/SDRAM/muchtoremember.v Normal file
View File

@@ -0,0 +1,330 @@
// SDRAM interface to AS4C32M16SB-7TCN
// 512 Mbit Single-Data-Rate SDRAM, 32Mx16 (8M x 16 x 4 Banks)
// Matthias Koch, January 2022
// With a lot of inspiration from Mike Field, Hamsterworks:
// https://web.archive.org/web/20190215130043/http://hamsterworks.co.nz/mediawiki/index.php/Simple_SDRAM_Controller
// https://web.archive.org/web/20190215130043/http://hamsterworks.co.nz/mediawiki/index.php/File:Verilog_Memory_controller_v0.1.zip
// Note: You may need to change all values marked with *** when changing clock frequency. This is for 40 MHz.
//
// Overview (as implemented below):
//  - The processor side sees a 32-bit word port: a one-cycle pulse on rd or a
//    non-zero wmask starts a transfer, busy stays high until it is done.
//  - Each 32-bit access is carried out as one burst of two 16-bit SDRAM
//    words (low half first), with auto-precharge.
//  - The controller is a one-hot state machine: init -> idle, and from idle
//    either a refresh, or activate -> read_1..read_5 / write_1..write_2.
//  - Address mapping: bank = addr[23:22], row = {addr[25:24], addr[21:11]},
//    column = {addr[10:2], 1'b0}. addr[1:0] is not used.
module muchtoremember (
// Interface to SDRAM chip, fully registered
output sd_clk, // Clock for SDRAM chip
output reg sd_cke, // Clock enabled
inout [15:0] sd_d, // Bidirectional data lines to/from SDRAM
output reg [12:0] sd_addr, // Address bus, multiplexed, 13 bits
output reg [1:0] sd_ba, // Bank select wires for 4 banks
output reg [1:0] sd_dqm, // Byte mask
output reg sd_cs, // Chip select
output reg sd_we, // Write enable
output reg sd_ras, // Row address select
output reg sd_cas, // Columns address select
// Interface to processor
input clk,
input resetn,
input [3:0] wmask,
input rd,
input [25:0] addr,
input [31:0] din,
output reg [31:0] dout,
output reg busy
);
parameter sdram_startup_cycles = 10100; // *** -- 100us, plus a little more, @ 100MHz (10100 cycles last about 252 us at the 40 MHz this file is tuned for)
parameter sdram_refresh_cycles = 195; // *** The refresh operation must be performed 8192 times within 64ms. --> One refresh every 7.8125 us.
// With a minimum clock of 25 MHz, this results in one refresh every 7.8125e-6 * 25e6 = 195 cycles.
// ----------------------------------------------------------
// -- Connections and buffer primitives
// ----------------------------------------------------------
assign sd_clk = ~clk; // Supply memory chip with a clock.
wire [15:0] sd_data_in; // Bidirectional data from SDRAM
reg [15:0] sd_data_out; // Bidirectional data to SDRAM
reg sd_data_drive; // High: FPGA controls wires Low: SDRAM controls wires
// Simulation (Icarus Verilog): behavioural tristate plus an input register.
// Otherwise: vendor I/O primitives (TRELLIS_IO / IFS1P3BX) are instantiated.
`ifdef __ICARUS__
reg [15:0] sd_data_in_buffered;
assign sd_d = sd_data_drive ? sd_data_out : 16'bz;
always @(posedge clk) sd_data_in_buffered <= sd_d;
assign sd_data_in = sd_data_in_buffered;
`else
wire [15:0] sd_data_in_unbuffered; // To connect primitives internally
TRELLIS_IO #(.DIR("BIDIR"))
sdio_tristate[15:0] (
.B(sd_d),
.I(sd_data_out),
.O(sd_data_in_unbuffered),
.T(!sd_data_drive)
);
// Registering the input is important for stability and delays data arrival by one clock cycle.
IFS1P3BX dbi_ff[15:0] (.D(sd_data_in_unbuffered), .Q(sd_data_in), .SCLK(clk), .PD({16{sd_data_drive}}));
`endif
// ----------------------------------------------------------
// -- Configuration to initialise the SDRAM chip
// ----------------------------------------------------------
// Taken from https://github.com/rxrbln/picorv32/blob/master/picosoc/sdram.v
localparam NO_WRITE_BURST = 1'b0; // 0=write burst enabled, 1=only single access write
localparam OP_MODE = 2'b00; // only 00 (standard operation) allowed
localparam CAS_LATENCY = 3'd2; // 2 or 3 cycles allowed
localparam ACCESS_TYPE = 1'b0; // 0=sequential, 1=interleaved
localparam BURST_LENGTH = 3'b001; // 000=1, 001=2, 010=4, 011=8
// 3+1+2+3+1+3 = 13 bits, driven on sd_addr together with CMD_LOAD_MODE.
localparam MODE = {3'b000, NO_WRITE_BURST, OP_MODE, CAS_LATENCY, ACCESS_TYPE, BURST_LENGTH};
// ----------------------------------------------------------
// -- All possible commands for the SDRAM chip
// ----------------------------------------------------------
// CS, RAS, CAS, WE
localparam CMD_INHIBIT = 4'b1111;
localparam CMD_NOP = 4'b0111;
localparam CMD_BURST_TERMINATE = 4'b0110;
localparam CMD_READ = 4'b0101;
localparam CMD_WRITE = 4'b0100;
localparam CMD_ACTIVE = 4'b0011;
localparam CMD_PRECHARGE = 4'b0010;
localparam CMD_AUTO_REFRESH = 4'b0001;
localparam CMD_LOAD_MODE = 4'b0000;
// ----------------------------------------------------------
// -- States of the SDRAM controller
// ----------------------------------------------------------
// One-hot encoding: for every state there is a bit index (xxx_bit, used to
// test state[xxx_bit]) and the corresponding one-hot value (used to assign
// state).
localparam s_init_bit = 0; localparam s_init = 1 << s_init_bit ;
localparam s_idle_bit = 1; localparam s_idle = 1 << s_idle_bit ;
localparam s_activate_bit = 2; localparam s_activate = 1 << s_activate_bit ;
localparam s_read_1_bit = 3; localparam s_read_1 = 1 << s_read_1_bit ;
localparam s_read_2_bit = 4; localparam s_read_2 = 1 << s_read_2_bit ;
localparam s_read_3_bit = 5; localparam s_read_3 = 1 << s_read_3_bit ;
localparam s_read_4_bit = 6; localparam s_read_4 = 1 << s_read_4_bit ;
localparam s_read_5_bit = 7; localparam s_read_5 = 1 << s_read_5_bit ;
localparam s_write_1_bit = 8; localparam s_write_1 = 1 << s_write_1_bit ;
localparam s_write_2_bit = 9; localparam s_write_2 = 1 << s_write_2_bit ;
localparam s_idle_in_6_bit = 10; localparam s_idle_in_6 = 1 << s_idle_in_6_bit ;
localparam s_idle_in_5_bit = 11; localparam s_idle_in_5 = 1 << s_idle_in_5_bit ;
localparam s_idle_in_4_bit = 12; localparam s_idle_in_4 = 1 << s_idle_in_4_bit ;
localparam s_idle_in_3_bit = 13; localparam s_idle_in_3 = 1 << s_idle_in_3_bit ;
localparam s_idle_in_2_bit = 14; localparam s_idle_in_2 = 1 << s_idle_in_2_bit ;
localparam s_idle_in_1_bit = 15; localparam s_idle_in_1 = 1 << s_idle_in_1_bit ;
(* onehot *)
reg [15:0] state = s_init;
// ----------------------------------------------------------
// -- Access control wires
// ----------------------------------------------------------
reg [14:0] reset_counter = sdram_startup_cycles;
reg [7:0] refresh_counter = 0;
reg refresh_pending = 1;
// Latched copies of the one-cycle rd / wmask request pulses.
reg rd_sticky = 0;
reg [3:0] wmask_sticky = 4'b0000;
// Low only in the last state of a read (s_read_5) or of a write (s_write_2):
// that is where busy and the sticky request flags get cleared.
wire stillatwork = ~(state[s_read_5_bit] | state[s_write_2_bit]);
// 9-bit result of decrementing the 8-bit counter: bit 8 is set when
// refresh_counter was 0, i.e. when the refresh interval has elapsed.
wire [8:0] refresh_counterN = refresh_counter - 1;
// ----------------------------------------------------------
// -- The memory controller
// ----------------------------------------------------------
always @(posedge clk)
if(!resetn) begin
// Synchronous reset. Note that the refresh counter/flag and the SDRAM
// command outputs are not touched here; the latter are driven again as
// soon as the s_init state runs.
state <= s_init;
reset_counter <= sdram_startup_cycles; // Counts backwards to zero
busy <= 0; // Technically, we are busy with initialisation, but there are no ongoing read or write requests
rd_sticky <= 0;
wmask_sticky <= 4'b0000;
sd_cke <= 0;
end else begin
// FemtoRV32 pulses read and write lines high for exactly one clock cycle.
// Address and data lines keep stable until busy is released.
// Therefore: Take note of the requested read or write, and assert busy flag immediately.
busy <= ((|wmask) | rd) | (busy & stillatwork );
rd_sticky <= rd | (rd_sticky & stillatwork );
wmask_sticky <= wmask | (wmask_sticky & {4{stillatwork}} );
// Schedule refreshes regularly
// The counter reloads when it wraps; refresh_pending is set on the wrap and
// cleared when the state machine passes through s_idle (where the refresh
// command is issued).
refresh_counter <= refresh_counterN[8] ? sdram_refresh_cycles : refresh_counterN[7:0];
refresh_pending <= (refresh_pending & ~state[s_idle_bit]) | refresh_counterN[8];
(* parallel_case *)
case(1'b1)
// Processor can already request the first read or write here, but has to wait then:
state[s_init_bit]: begin
//------------------------------------------------------------------------
//-- This is the initial startup state, where we wait for at least 100us
//-- before starting the start sequence
//--
//-- The initialisation sequence is
//-- * de-assert SDRAM_CKE
//-- * 100us wait,
//-- * assert SDRAM_CKE
//-- * wait at least one cycle,
//-- * PRECHARGE
//-- * wait 2 cycles
//-- * REFRESH,
//-- * tREF wait
//-- * REFRESH,
//-- * tREF wait
//-- * LOAD_MODE_REG
//-- * 2 cycles wait
//------------------------------------------------------------------------
sd_ba <= 2'b00; // Reserved for future use in mode configuration
sd_dqm <= 2'b11; // Data bus in High-Z state
sd_data_drive <= 0; // Do not drive the data bus now
// The commands are issued during the last 34 counts of the countdown;
// every other count issues a NOP.
case (reset_counter) // Counts from a large value down to zero
33: begin sd_cke <= 1; end
// Ensure all rows are closed
31: begin {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_PRECHARGE; sd_addr <= 13'b0010000000000; end
// These refreshes need to be at least tRFC (63ns) apart
23: begin {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_AUTO_REFRESH; end
15: begin {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_AUTO_REFRESH; end
// Now load the mode register
7: begin {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_LOAD_MODE; sd_addr <= MODE; end
default: {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP;
endcase
reset_counter <= reset_counter - 1;
if (reset_counter == 0) state <= s_idle;
end
// New read or write requests from the processor may arrive in these states:
//-----------------------------------------------------
//-- Additional NOPs to meet timing requirements
//-----------------------------------------------------
// Chain of wait states: entering at s_idle_in_N reaches s_idle N cycles later.
state[s_idle_in_6_bit]: begin state <= s_idle_in_5; {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP; end
state[s_idle_in_5_bit]: begin state <= s_idle_in_4; {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP; end
state[s_idle_in_4_bit]: begin state <= s_idle_in_3; {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP; end
state[s_idle_in_3_bit]: begin state <= s_idle_in_2; {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP; end
state[s_idle_in_2_bit]: begin state <= s_idle_in_1; {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP; end
state[s_idle_in_1_bit]: begin state <= s_idle; {sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP; end
// Refresh cycle needs tRFC (63ns), so 6 idle cycles are needed @ 100MHz
//-----------------------------------------------------
//-- Dispatch all possible actions while idling (NOP)
//-----------------------------------------------------
// Priority: a pending refresh first, then a latched read/write request
// (row activation), otherwise stay idle. Bank and row address are driven
// in every idle cycle so that they are valid together with CMD_ACTIVE.
state[s_idle_bit]: begin
sd_ba <= addr[23:22]; // Bank select, 2 bits
sd_addr <= {addr[25:24], addr[21:11]} ; // RA0-RA12: 8192 Row address
{sd_cs, sd_ras, sd_cas, sd_we} <= refresh_pending ? CMD_AUTO_REFRESH :
(|wmask_sticky) | rd_sticky ? CMD_ACTIVE :
CMD_NOP;
state <= refresh_pending ? s_idle_in_2 : // *** Experimental result: Direct transition to s_idle does not work @ 40 MHz, s_idle_in_1 is unstable, sd_idle_in_2 is fine.
(|wmask_sticky) | rd_sticky ? s_activate :
s_idle;
end
// Busy flag is set while state machine is in the following states:
//-----------------------------------------------------
//-- Opening the row ready for reads or writes
//-----------------------------------------------------
state[s_activate_bit]: begin
sd_data_drive <= ~rd_sticky; // Drive or release bus early, before the SDRAM chip takes over to drive these lines
{sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP;
state <= rd_sticky ? s_read_1 : s_write_1;
end
// RAS-to-CAS delay, also necessary for precharge, used in this state machine: 2 cycles.
// Specification of AS4C32M16SB-7TCN: 21 ns --> Good for 1/(21e-9 / 2) = 95.23 MHz
//-----------------------------------------------------
//-- Processing the read transaction
//-----------------------------------------------------
state[s_read_1_bit]: begin
sd_dqm <= 2'b00; // SDRAM chip shall drive the bus lines
{sd_cs, sd_ras, sd_cas, sd_we} <= CMD_READ;
sd_addr <= {3'b001, addr[10:2], 1'b0}; // Bit 10: Auto-precharge. CA0-CA9: 1024 Column address.
state <= s_read_2;
end
state[s_read_2_bit]: begin
{sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP;
state <= s_read_3;
end
// Pure wait state: nothing is driven, the outputs keep their values.
state[s_read_3_bit]: state <= s_read_4;
// First 16-bit word of the burst: low half of dout.
state[s_read_4_bit]: begin
dout[15:0] <= sd_data_in;
state <= s_read_5;
end
// Busy is cleared when reaching this state, fulfilling the request:
// Second 16-bit word of the burst: high half of dout.
state[s_read_5_bit]: begin
dout[31:16] <= sd_data_in;
state <= s_idle; // *** Experimental result: Direct transition to s_idle is fine @ 40 MHz
end
// Precharge (which is automatic here) needs 21 ns, therefore 2 idle cycles need to be inserted
//-----------------------------------------------------
// -- Processing the write transaction
//-----------------------------------------------------
// First 16-bit word of the burst: low half of din, masked by wmask[1:0]
// (sd_dqm gets the inverted write mask).
state[s_write_1_bit]: begin
sd_addr <= {3'b001, addr[10:2], 1'b0}; // Bit 10: Auto-precharge. CA0-CA9: 1024 Column address.
sd_data_out <= din[15:0];
sd_dqm <= ~wmask_sticky[1:0];
{sd_cs, sd_ras, sd_cas, sd_we} <= CMD_WRITE;
state <= s_write_2;
end
// Busy is cleared when reaching this state, fulfilling the request:
// Second 16-bit word of the burst: high half of din, masked by wmask[3:2].
state[s_write_2_bit]: begin
sd_data_out <= din[31:16];
sd_dqm <= ~wmask_sticky[3:2];
{sd_cs, sd_ras, sd_cas, sd_we} <= CMD_NOP;
state <= s_idle_in_2; // *** Experimental result: s_idle_in_1 does not work @ 40 MHz, s_idle_in_2 is fine.
end
// Write needs 14 ns internally, then Precharge needs 21 ns, therefore 3 idle cycles need to be inserted
endcase
end
endmodule

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,6 @@
#!/bin/sh
# Compile the SDRAM controller testbench with Icarus Verilog and run it.
# Stop at the first failing command, so that a compile error does not end up
# running a stale simulation image left over from a previous build.
set -e
iverilog -o test_sdram.vvp -s test_sdram test_sdram.v ../muchtoremember.v mt48lc16m16a2.v
# Run exactly the image built above (a glob would hand any other *.vvp file
# lying around to vvp as extra arguments).
vvp test_sdram.vvp

View File

@@ -0,0 +1,116 @@
`timescale 1ns/100ps // 1 ns time unit, 100 ps resolution
`default_nettype none // Makes it easier to detect typos !

// Testbench for the muchtoremember SDRAM controller.
// Connects the controller to the mt48lc16m16a2 SDRAM simulation model
// (defined in another file), waits for the controller's initialisation
// sequence, then performs one 32-bit write followed by one 32-bit read at
// address 0. Signal changes are printed with $monitor and dumped to
// sdram.vcd; the read-back value is not checked automatically.
//
// All nets are declared before the instances that use them: with
// `default_nettype none there are no implicit nets, and Verilog requires
// identifiers to be declared before they are referenced.
module test_sdram;

   // 12.5 ns half period -> 25 ns period (40 MHz)
   reg clk;
   always #12.5 clk = !clk;

   reg resetq = 0; // active-low reset of the controller

   /***************************************************************************/
   // Processor-side bus
   /***************************************************************************/

   wire [31:0] mem_address = 0;           // both accesses target address 0
   wire [31:0] mem_wdata = 32'h00030004;  // word written by the write access

   wire [31:0] sdram_rdata;
   wire sdram_busy;
   reg [3:0] sdram_wmask = 4'b0000;
   reg sdram_rd = 0;

   /***************************************************************************/
   // Wires between the controller and the 64 MB SD-RAM
   /***************************************************************************/

   wire sdram_csn;       // chip select
   wire sdram_clk;       // clock to SDRAM
   wire sdram_cke;       // clock enable to SDRAM
   wire sdram_rasn;      // SDRAM RAS
   wire sdram_casn;      // SDRAM CAS
   wire sdram_wen;       // SDRAM write-enable
   wire [12:0] sdram_a;  // SDRAM address bus
   wire [1:0] sdram_ba;  // SDRAM bank-address
   wire [1:0] sdram_dqm; // byte select
   wire [15:0] sdram_d;

   /***************************************************************************/
   // SD-RAM-Controller
   /***************************************************************************/

   muchtoremember sdram(
      // Physical interface
      .sd_d(sdram_d),
      .sd_addr(sdram_a),
      .sd_dqm(sdram_dqm),
      .sd_cs(sdram_csn),
      .sd_ba(sdram_ba),
      .sd_we(sdram_wen),
      .sd_ras(sdram_rasn),
      .sd_cas(sdram_casn),
      .sd_clk(sdram_clk),
      .sd_cke(sdram_cke),
      // Internal bus interface
      .clk(clk),
      .resetn(resetq),
      .addr(mem_address[25:0]),
      .wmask(sdram_wmask),
      .rd(sdram_rd),
      .din(mem_wdata),
      .dout(sdram_rdata),
      .busy(sdram_busy)
   );

   /***************************************************************************/
   // 64 MB SD-RAM
   /***************************************************************************/

   mt48lc16m16a2 memory(
      .Dq(sdram_d),
      .Addr(sdram_a),
      .Ba(sdram_ba),
      .Clk(sdram_clk),
      .Cke(sdram_cke),
      .Cs_n(sdram_csn),
      .Ras_n(sdram_rasn),
      .Cas_n(sdram_casn),
      .We_n(sdram_wen),
      .Dqm(sdram_dqm)
   );

   /***************************************************************************/
   // Test sequence
   /***************************************************************************/

   integer i;
   initial begin
      $dumpfile("sdram.vcd");    // create a VCD waveform dump
      $dumpvars(0, test_sdram);  // dump variable changes in the testbench
                                 // and all modules under it
      clk = 0;
      resetq = 0;
      @(negedge clk);
      resetq = 1;

      // Let the controller run through its start-up countdown
      // (sdram_startup_cycles defaults to 10100 cycles).
      for (i = 0; i < 11000; i = i + 1) begin
         @(negedge clk);
      end

      $monitor("t=%d: sdram_d = %8h Busy %b sdram_rdata %8h", $time, sdram_d, sdram_busy, sdram_rdata);

      // The request lines are pulsed for exactly one clock cycle, then the
      // testbench simply waits 64 cycles for the transfer to complete.
      $display(" --- Write access ---");
      sdram_wmask = 15;
      @(negedge clk);
      sdram_wmask = 0;
      for (i = 0; i < 64; i = i + 1) begin
         @(negedge clk);
      end

      $display(" --- Read access ---");
      sdram_rd = 1;
      @(negedge clk);
      sdram_rd = 0;
      for (i = 0; i < 64; i = i + 1) begin
         @(negedge clk);
      end

      $finish();
   end

endmodule

View File

@@ -0,0 +1,6 @@
#!/bin/sh
# Remove simulation artefacts from the current directory:
# compiled simulation images (*.vvp) and waveform dumps (*.vcd).
rm -f *.vvp *.vcd

608
RTL/femtosoc.v Normal file
View File

@@ -0,0 +1,608 @@
// femtorv32, a minimalistic RISC-V RV32I core
// (minus SYSTEM and FENCE that are not implemented)
//
// Bruno Levy, May-June 2020
//
// This file: the "System on Chip" that goes with femtorv32.
/*************************************************************************************/
`default_nettype none // Makes it easier to detect typos !
`include "femtosoc_config.v" // User configuration of processor and SOC.
`include "PLL/femtopll.v" // The PLL (generates clock at NRV_FREQ)
`include "DEVICES/uart.v" // The UART (serial port over USB)
`include "DEVICES/SSD1351_1331.v" // The OLED display
`include "DEVICES/MappedSPIFlash.v" // Idem, but mapped in memory
`include "DEVICES/MAX7219.v" // 8x8 led matrix driven by a MAX7219 chip
`include "DEVICES/LEDs.v" // Driver for 4 leds
`include "DEVICES/SDCard.v" // Driver for SDCard (just for bitbanging for now)
`include "DEVICES/Buttons.v" // Driver for the buttons
`include "DEVICES/FGA.v" // Femto Graphic Adapter
`include "DEVICES/HardwareConfig.v" // Constant registers to query hardware config.
// The Ice40UP5K has ample quantities (128 KB) of single-ported RAM that can be
// used as system RAM (but cannot be inferred, uses a special block).
`ifdef ICE40UP5K_SPRAM
`include "DEVICES/ice40up5k_spram.v"
`endif
/*************************************************************************************/
`ifndef NRV_RESET_ADDR
`define NRV_RESET_ADDR 0
`endif
`ifndef NRV_ADDR_WIDTH
`define NRV_ADDR_WIDTH 24
`endif
/*************************************************************************************/
module femtosoc(
`ifdef NRV_IO_LEDS
`ifdef FOMU
output rgb0,rgb1,rgb2,
`else
output D1,D2,D3,D4,D5,
`endif
`endif
`ifdef NRV_IO_SSD1351_1331
output oled_DIN, oled_CLK, oled_CS, oled_DC, oled_RST,
`endif
`ifdef NRV_IO_UART
input RXD,
output TXD,
`endif
`ifdef NRV_IO_MAX7219
output ledmtx_DIN, ledmtx_CS, ledmtx_CLK,
`endif
`ifdef NRV_SPI_FLASH
inout spi_mosi, inout spi_miso, output spi_cs_n,
`ifndef ULX3S
output spi_clk, // ULX3S has spi clk shared with ESP32, using USRMCLK (below)
`endif
`endif
`ifdef NRV_IO_SDCARD
output sd_mosi, input sd_miso, output sd_cs_n, output sd_clk,
`endif
`ifdef NRV_IO_BUTTONS
`ifdef ICE_FEATHER
input [3:0] buttons,
`else
input [5:0] buttons,
`endif
`endif
`ifdef ULX3S
output wifi_en,
`endif
input RESET,
`ifdef FOMU
output usb_dp, usb_dn, usb_dp_pu,
`endif
`ifdef NRV_IO_FGA
output [3:0] gpdi_dp,
`endif
`ifdef NRV_IO_IRDA
output irda_TXD,
input irda_RXD,
output irda_SD,
`endif
input pclk
);
/********************* Technicalities **************************************/
// On the ULX3S, deactivate the ESP32 so that it does not interfere with
// the other devices (especially the SDCard).
`ifdef ULX3S
assign wifi_en = 1'b0;
`endif
// On the ULX3S, the CLK pin of the SPI is multiplexed with the ESP32.
// It can be accessed using the USRMCLK primitive of the ECP5
// as follows.
`ifdef NRV_SPI_FLASH
`ifdef ULX3S
wire spi_clk;
wire tristate = 1'b0;
`ifndef BENCH
USRMCLK u1 (.USRMCLKI(spi_clk), .USRMCLKTS(tristate));
`endif
`endif
`endif
`ifdef FOMU
// Internal wires for the LEDs,
// need to convert to signal for RGB led
wire D1,D2,D3,D4,D5;
// On the FOMU, USB pins should be statically driven if not used
assign usb_dp = 1'b0;
assign usb_dn = 1'b0;
assign usb_dp_pu = 1'b0;
`endif
wire clk;
femtoPLL #(
.freq(`NRV_FREQ)
) pll(
.pclk(pclk),
.clk(clk)
);
// A little delay for sending the reset signal after startup.
// Explanation here: (ice40 BRAM reads incorrect values during
// first cycles).
// http://svn.clifford.at/handicraft/2017/ice40bramdelay/README
// On the ICE40-UP5K, 4096 cycles do not suffice (-> 65536 cycles)
`ifdef ICE_STICK
reg [11:0] reset_cnt = 0;
`else
reg [15:0] reset_cnt = 0;
`endif
wire reset = &reset_cnt;
/* verilator lint_off WIDTH */
`ifdef NRV_NEGATIVE_RESET
always @(posedge clk,negedge RESET) begin
if(!RESET) begin
reset_cnt <= 0;
end else begin
reset_cnt <= reset_cnt + !reset;
end
end
`else
always @(posedge clk,posedge RESET) begin
if(RESET) begin
reset_cnt <= 0;
end else begin
reset_cnt <= reset_cnt + !reset;
end
end
`endif
/* verilator lint_on WIDTH */
/***************************************************************************************************
/*
* Memory and memory interface
* memory map:
* address[21:2] RAM word address (4 Mb max).
* address[23:22] 00: RAM
* 01: IO page (1-hot) (starts at 0x400000)
* 10: SPI Flash page (starts at 0x800000)
*/
// The memory bus.
wire [31:0] mem_address; // 24 bits are used internally. The two LSBs are ignored (using word addresses)
wire [3:0] mem_wmask; // mem write mask and strobe /write Legal values are 0000,0001,0010,0100,1000,0011,1100,1111
wire [31:0] mem_rdata; // processor <- (mem and peripherals)
wire [31:0] mem_wdata; // processor -> (mem and peripherals)
wire mem_rstrb; // mem read strobe. Goes high to initiate memory read.
wire mem_rbusy; // processor <- (mem and peripherals). Stays high until a read transfer is finished.
wire mem_wbusy; // processor <- (mem and peripherals). Stays high until a write transfer is finished.
wire mem_wstrb = |mem_wmask; // mem write strobe, goes high to initiate memory write (deduced from wmask)
// IO bus.
`ifdef NRV_MAPPED_SPI_FLASH
wire mem_address_is_ram = (mem_address[23:22] == 2'b00);
wire mem_address_is_io = (mem_address[23:22] == 2'b01);
wire mem_address_is_spi_flash = (mem_address[23:22] == 2'b10);
wire mapped_spi_flash_rbusy;
wire [31:0] mapped_spi_flash_rdata;
MappedSPIFlash mapped_spi_flash(
.clk(clk),
.rstrb(mem_rstrb && mem_address_is_spi_flash),
.word_address(mem_address[21:2]),
.rdata(mapped_spi_flash_rdata),
.rbusy(mapped_spi_flash_rbusy),
.CLK(spi_clk),
.CS_N(spi_cs_n),
`ifdef SPI_FLASH_FAST_READ_DUAL_IO
.IO({spi_miso,spi_mosi})
`else
.MISO(spi_miso),
.MOSI(spi_mosi)
`endif
);
`else
wire mem_address_is_io = mem_address[22];
wire mem_address_is_ram = !mem_address[22];
`endif
reg [31:0] io_rdata;
wire [31:0] io_wdata = mem_wdata;
wire io_rstrb = mem_rstrb && mem_address_is_io;
wire io_wstrb = mem_wstrb && mem_address_is_io;
wire [19:0] io_word_address = mem_address[21:2]; // word offset in io page
wire io_rbusy;
wire io_wbusy;
assign mem_rbusy = io_rbusy
`ifdef NRV_MAPPED_SPI_FLASH
| mapped_spi_flash_rbusy
`endif
;
assign mem_wbusy = io_wbusy;
`ifdef NRV_IO_FGA
wire mem_address_is_vram = mem_address[21];
`else
parameter mem_address_is_vram = 1'b0;
`endif
wire [19:0] ram_word_address = mem_address[21:2];
// Using the 128 KBytes of SPRAM (single-ported RAM) embedded in the Ice40 UP5K
`ifdef ICE40UP5K_SPRAM
wire [31:0] ram_rdata;
wire spram_wr = mem_address_is_ram && !mem_address_is_vram;
ice40up5k_spram RAM(
.clk(clk),
.wen({4{spram_wr}} & mem_wmask),
.addr(ram_word_address[14:0]),
.wdata(mem_wdata),
.rdata(ram_rdata)
);
`else // Synthethizing BRAM
(* no_rw_check *)
reg [31:0] RAM[0:(`NRV_RAM/4)-1];
reg [31:0] ram_rdata;
// Initialize the RAM with the generated firmware hex file.
// The hex file is generated by the bundled elf-2-verilog converter (see TOOLS/FIRMWARE_WORDS_SRC)
`ifndef NRV_RUN_FROM_SPI_FLASH
initial begin
$readmemh("FIRMWARE/firmware.hex",RAM);
end
`endif
// The power of YOSYS: it infers BRAM primitives automatically ! (and recognizes
// masked writes, amazing ...)
/* verilator lint_off WIDTH */
always @(posedge clk) begin
if(mem_address_is_ram && !mem_address_is_vram) begin
if(mem_wmask[0]) RAM[ram_word_address][ 7:0 ] <= mem_wdata[ 7:0 ];
if(mem_wmask[1]) RAM[ram_word_address][15:8 ] <= mem_wdata[15:8 ];
if(mem_wmask[2]) RAM[ram_word_address][23:16] <= mem_wdata[23:16];
if(mem_wmask[3]) RAM[ram_word_address][31:24] <= mem_wdata[31:24];
end
ram_rdata <= RAM[ram_word_address];
end
/* verilator lint_on WIDTH */
`endif
`ifdef NRV_IO_FGA
wire [31:0] FGA_rdata;
FGA graphic_adapter(
.pclk(pclk), // board clock
.clk(clk), // femtorv32 clock
.sel(mem_address_is_ram && mem_address_is_vram),
.mem_wmask(mem_wmask),
.mem_address(mem_address[16:0]),
.mem_wdata(mem_wdata),
.gpdi_dp(gpdi_dp),
.io_rstrb(io_rstrb),
.io_wstrb(io_wstrb),
.sel_cntl(io_word_address[IO_FGA_CNTL_bit]),
.sel_dat(io_word_address[IO_FGA_DAT_bit]),
.rdata(FGA_rdata)
);
`endif
`ifdef NRV_MAPPED_SPI_FLASH
assign mem_rdata = mem_address_is_io ? io_rdata :
mem_address_is_ram ? ram_rdata :
mapped_spi_flash_rdata;
`else
assign mem_rdata = mem_address_is_io ? io_rdata : ram_rdata;
`endif
/***************************************************************************************************
/*
* Memory-mapped IO
* Mapped IO uses "one-hot" addressing, to make decoder
* simpler (saves a lot of LUTs), as in J1/swapforth,
* thanks to Matthias Koch(Mecrisp author) for the idea !
* The included file contains the symbolic constants that
* determine which device uses which bit.
*/
`include "DEVICES/HardwareConfig_bits.v"
/*
* Devices are components plugged to the IO memory bus.
* A few words follow in case you want to write your own devices:
*
* Each device has one or several register(s). Each register
* can be optionally read or/and written.
* - Each register is selected by a .sel_xxx signal (where xxx
* is the name of the register). With the 1-hot encoding that
* I'm using, .sel_xxx is systematically one of the bits of the
* IO word address (it is also possible to write a real
* address decoder, at the expense of eating-up a larger
* number of LUTs).
* - If the device requires wait cycles for writing and/or reading,
* it can have a .wbusy and/or .rbusy signal(s). All the .wbusy
* and .rbusy signals of all the devices are ORed at the end of
* this file to form the .io_rbusy and .io_wbusy signals.
* - If the device has read access, then it has a 32-bits .xxx_rdata
* signal, that returns 32'b0 if the device is not selected, or the
* read data otherwise. All the .xxx_rdata signals of all the devices
* are ORed at the end of this file to form the 32-bits io_rdata signal.
* - Finally, of course, each device is plugged to some pins of the FPGA,
* the corresponding signals are in capital letters.
*/
/*********************** Hardware configuration ************/
/*
* Three memory-mapped constant registers that make it easy for
* client code to query installed RAM and configured devices
* (this one does not use any pin, of course).
* Uses some LUTs, a bit stupid, but more comfortable, so that
* I do not need to change the software on the SDCard each time
* I test a different hardware configuration.
*/
`ifdef NRV_IO_HARDWARE_CONFIG
wire [31:0] hwconfig_rdata;
HardwareConfig hwconfig(
.clk(clk),
.sel_memory(io_word_address[IO_HW_CONFIG_RAM_bit]),
.sel_devices(io_word_address[IO_HW_CONFIG_DEVICES_bit]),
.sel_cpuinfo(io_word_address[IO_HW_CONFIG_CPUINFO_bit]),
.rdata(hwconfig_rdata)
);
`endif
/*********************** Four LEDs ************************/
`ifdef NRV_IO_LEDS
wire [31:0] leds_rdata;
LEDDriver leds(
`ifdef NRV_IO_IRDA
.irda_TXD(irda_TXD),
.irda_RXD(irda_RXD),
.irda_SD(irda_SD),
`endif
.clk(clk),
.rstrb(io_rstrb),
.wstrb(io_wstrb),
.sel(io_word_address[IO_LEDS_bit]),
.wdata(io_wdata),
.rdata(leds_rdata),
.LED({D4,D3,D2,D1})
);
`endif
/********************** SSD1351/SSD1331 oled display ******/
`ifdef NRV_IO_SSD1351_1331
wire SSD1351_wbusy;
SSD1351 oled_display(
.clk(clk),
.wstrb(io_wstrb),
.sel_cntl(io_word_address[IO_SSD1351_CNTL_bit]),
.sel_cmd(io_word_address[IO_SSD1351_CMD_bit]),
.sel_dat(io_word_address[IO_SSD1351_DAT_bit]),
.sel_dat16(io_word_address[IO_SSD1351_DAT16_bit]),
.wdata(io_wdata),
.wbusy(SSD1351_wbusy),
.DIN(oled_DIN),
.CLK(oled_CLK),
.CS(oled_CS),
.DC(oled_DC),
.RST(oled_RST)
);
`endif
/********************** UART ****************************************/
`ifdef NRV_IO_UART
// Internal wires to connect IO buffers to UART
wire RXD_internal;
wire TXD_internal;
`ifdef ULX3S
`ifndef BENCH_OR_LINT
// On the ULX3S, we need to latch RXD, using the latch
// embedded in the input buffer. If we do not do that,
// then we unpredictably get garbage on the UART.
// The two primitives BB (bidirectional three-state buffer)
// and IFS1P3BX (latch in IO pin) are interpreted by the
// synthesis tool as an IO cell.
wire RXD_btw;
BB RXD_bb(
.I(1'b0),
.O(RXD_btw),
.B(RXD),
.T(1'b1)
);
IFS1P3BX RXD_pin(
.SCLK(clk),
.D(RXD_btw),
.Q(RXD_internal),
.PD(1'b0)
);
assign TXD = TXD_internal; // For now, do not latch output (but we may need to)
`define UART_IO_BUFFER
`endif
`endif
// For other boards, we directly connect RXD and TXD to the UART (but we may need
// to latch).
`ifndef UART_IO_BUFFER
assign RXD_internal = RXD;
assign TXD = TXD_internal;
`endif
wire uart_brk;
wire [31:0] uart_rdata;
UART uart(
.clk(clk),
.rstrb(io_rstrb),
.wstrb(io_wstrb),
.sel_dat(io_word_address[IO_UART_DAT_bit]),
.sel_cntl(io_word_address[IO_UART_CNTL_bit]),
.wdata(io_wdata),
.rdata(uart_rdata),
.RXD(RXD_internal),
.TXD(TXD_internal),
.brk(uart_brk)
);
`else
wire uart_brk = 1'b0;
`endif
/********** MAX7219 led matrix driver *******************************/
`ifdef NRV_IO_MAX7219
wire max7219_wbusy;
MAX7219 max7219(
.clk(clk),
.wstrb(io_wstrb),
.sel(io_word_address[IO_MAX7219_DAT_bit]),
.wdata(io_wdata),
.wbusy(max7219_wbusy),
.DIN(ledmtx_DIN),
.CS(ledmtx_CS),
.CLK(ledmtx_CLK)
);
`endif
/********************* SPI SDCard *********************************/
/*
* This one has an output register directly wired to the CLK,MOSI,CS_N
* and an input register directly wired to MISO. The software driver
* implements the SPI protocol by bit-banging (see FIRMWARE/LIBFEMTORV32/spi_sd.c).
* One day I'll replace it with a hardware driver... if I have time !
* ... a generic SPI driver would be good to have also.
*/
`ifdef NRV_IO_SDCARD
wire [31:0] sdcard_rdata;
SDCard sdcard(
.clk(clk),
.rstrb(io_rstrb),
.wstrb(io_wstrb),
.sel(io_word_address[IO_SDCARD_bit]),
.wdata(io_wdata),
.rdata(sdcard_rdata),
.CLK(sd_clk),
.MISO(sd_miso),
.MOSI(sd_mosi),
.CS_N(sd_cs_n)
);
`endif
/********************* Buttons *************************************/
/*
* Directly wired to the buttons.
*/
`ifdef NRV_IO_BUTTONS
wire [31:0] buttons_rdata;
Buttons buttons_driver(
.sel(io_word_address[IO_BUTTONS_bit]),
.rdata(buttons_rdata),
.BUTTONS(buttons)
);
`endif
/************** io_rdata, io_rbusy and io_wbusy signals *************/
/*
* io_rdata is latched. Not mandatory, but it probably allows a higher clock frequency (to be tested).
*/
always @(posedge clk) begin
io_rdata <= 0
`ifdef NRV_IO_HARDWARE_CONFIG
| hwconfig_rdata
`endif
`ifdef NRV_IO_LEDS
| leds_rdata
`endif
`ifdef NRV_IO_UART
| uart_rdata
`endif
`ifdef NRV_IO_SDCARD
| sdcard_rdata
`endif
`ifdef NRV_IO_BUTTONS
| buttons_rdata
`endif
`ifdef NRV_IO_FGA
| FGA_rdata
`endif
;
end
// For now, we got no device that has
// blocking reads (SPI flash blocks on
// write address and waits for read data).
assign io_rbusy = 0 ;
assign io_wbusy = 0
`ifdef NRV_IO_SSD1351_1331
| SSD1351_wbusy
`endif
`ifdef NRV_IO_MAX7219
| max7219_wbusy
`endif
`ifdef NRV_IO_SPI_FLASH
| spi_flash_wbusy
`endif
;
/****************************************************************/
/* And last but not least, the processor */
reg error=1'b0;
FemtoRV32 #(
.ADDR_WIDTH(`NRV_ADDR_WIDTH),
.RESET_ADDR(`NRV_RESET_ADDR)
) processor(
.clk(clk),
.mem_addr(mem_address),
.mem_wdata(mem_wdata),
.mem_wmask(mem_wmask),
.mem_rdata(mem_rdata),
.mem_rstrb(mem_rstrb),
.mem_rbusy(mem_rbusy),
.mem_wbusy(mem_wbusy),
`ifdef NRV_INTERRUPTS
.interrupt_request(1'b0),
`endif
.reset(reset && !uart_brk)
);
`ifdef NRV_IO_LEDS
assign D5 = error;
`ifdef FOMU
SB_RGBA_DRV #(
.CURRENT_MODE("0b1"), // half current
.RGB0_CURRENT("0b000011"), // 4 mA
.RGB1_CURRENT("0b000011"), // 4 mA
.RGB2_CURRENT("0b000011") // 4 mA
) RGBA_DRIVER (
.CURREN(1'b1),
.RGBLEDEN(1'b1),
.RGB0PWM(D1),
.RGB1PWM(D2),
.RGB2PWM(D3),
.RGB0(rgb0),
.RGB1(rgb1),
.RGB2(rgb2)
);
`endif
`endif
endmodule

72
RTL/femtosoc_bench.v Normal file
View File

@@ -0,0 +1,72 @@
/*
* testbench for femtosoc/femtorv32
*
* 1. select one of the processors by uncommenting one of the
* lines NRV_FEMTORV32_XXX
*
* 2. edit FIRMWARE/config.mk and make sure ARCH corresponds to
* selected processor.
*
* $ cd FIRMWARE/EXAMPLES
* $ make hello.hex
* $ cd ../..
* $ make testbench
*
* Uncomment VERBOSE for extensive information (states ...)
*/
`timescale 1ns/1ns
//`include "femtosoc_config.v"
//
//`ifndef BENCH
//`define BENCH
//`endif
`define VERBOSE // Uncomment to have detailed log traces of all states
`include "femtosoc.v"
// Testbench top level: instantiates the SoC with RXD and RESET tied low.
//  - Under Verilator, the clock comes from the enclosing harness through the
//    pclk input and the OLED pins are exported as module outputs.
//  - Otherwise the bench is self-contained and generates its own clock.
`ifdef VERILATOR
module femtoRV32_bench(
   input  pclk,
   output oled_DIN, oled_CLK, oled_CS, oled_DC, oled_RST
);
`else
module femtoRV32_bench();
   reg pclk;
`endif

   wire       TXD;   // UART output of the SoC (left unconnected here)
   wire [4:0] LEDs;  // LEDs[0]..LEDs[4] are driven by D1..D5

   femtosoc uut(
      .pclk(pclk),
      .RESET(1'b0),
      .RXD(1'b0),
      .TXD(TXD),
      .D1(LEDs[0]),
      .D2(LEDs[1]),
      .D3(LEDs[2]),
      .D4(LEDs[3]),
      .D5(LEDs[4])
`ifdef NRV_IO_SSD1351_1331
      ,
      .oled_DIN(oled_DIN),
      .oled_CLK(oled_CLK),
      .oled_CS(oled_CS),
      .oled_DC(oled_DC),
      .oled_RST(oled_RST)
`endif
   );

`ifndef VERILATOR
   // Free-running clock: starts low and toggles every time unit
   // (period = 2 time units with the 1ns/1ns timescale of this file).
   initial pclk = 0;
   always #1 pclk = ~pclk;
`endif

endmodule

149
RTL/femtosoc_config.v Normal file
View File

@@ -0,0 +1,149 @@
// Configuration file for femtosoc/femtorv32
//
// Overview (in order):
//   1. include the board-specific configuration selected by the board macro
//      (board macros are set in the Makefile),
//   2. derive reset polarity, SPI-flash and FPGA-family helper macros,
//   3. include the processor core selected by the NRV_FEMTORV32_xxx macro.

// The Verilator bench is a bench too: BENCH_VERILATOR implies BENCH.
`ifdef BENCH_VERILATOR
`define BENCH
`endif

// Board-specific configuration files.
`ifdef ULX3S
`include "CONFIGS/ulx3s_config.v"
`endif
`ifdef ICE_STICK
`include "CONFIGS/icestick_config.v"
`endif
`ifdef ICE_BREAKER
`include "CONFIGS/icebreaker_config.v"
`endif
`ifdef ECP5_EVN
`include "CONFIGS/ecp5evn_config.v"
`endif
`ifdef ARTY
`include "CONFIGS/arty_config.v"
`endif
`ifdef ICE_SUGAR_NANO
`include "CONFIGS/icesugarnano_config.v"
`endif
`ifdef CMODA7
`include "CONFIGS/cmod_a7_config.v"
`endif
`ifdef BENCH_VERILATOR
`include "CONFIGS/bench_config.v"
`endif
// Fallback when none of the files above marked the design as configured.
// NOTE(review): presumably each CONFIGS/*_config.v defines NRV_CONFIGURED;
// those files are not visible from here - confirm.
`ifndef NRV_CONFIGURED
`include "CONFIGS/generic_config.v"
`endif
/******************************************************************************/
/*
* Uncomment if the RESET button is wired and active low:
* (wire a push button and a pullup resistor to
* pin 47 or change in nanorv.pcf).
*/
`ifdef ICE_STICK
//`define NRV_NEGATIVE_RESET
`endif
`ifdef FOMU
`define NRV_NEGATIVE_RESET
`endif
// NRV_SPI_FLASH is defined whenever the SPI flash is used in either mode
// (as an IO device: NRV_IO_SPI_FLASH, or memory-mapped: NRV_MAPPED_SPI_FLASH).
`ifdef NRV_IO_SPI_FLASH
`define NRV_SPI_FLASH
`endif
`ifdef NRV_MAPPED_SPI_FLASH
`define NRV_SPI_FLASH
`endif
/*
* On the ECP5 evaluation board, there is already a wired button, active low,
* wired to the "P4" ball of the ECP5 (see ecp5_evn.lpf)
*/
`ifdef ECP5_EVN
`define NRV_NEGATIVE_RESET
`endif
// Toggle FPGA defines (ICE40, ECP5) in function of board defines (ICE_STICK, ECP5_EVN)
// Board defines are set in Makefile.
`ifdef ICE_STICK
`define ICE40
`endif
`ifdef ICE_BREAKER
`define ICE40
`endif
`ifdef ICE_FEATHER
`define ICE40
`endif
`ifdef ICE_SUGAR
`define ICE40
`endif
// The IceSugar-nano additionally requests PASSTHROUGH_PLL.
// NOTE(review): presumably this makes the PLL module forward the board clock
// unchanged; the PLL source is not visible from here - confirm.
`ifdef ICE_SUGAR_NANO
`define ICE40
`define PASSTHROUGH_PLL
`endif
`ifdef FOMU
`define ICE40
`endif
`ifdef ECP5_EVN
`define ECP5
`endif
`ifdef ULX3S
`define ECP5
`endif
/******************************************************************************************************************/
/* Processor */
// NRV_IS_IO_ADDR(addr) is the OR-reduction of address bits 23 and 22, i.e. it is
// asserted for any address outside the RAM page (address[23:22] == 2'b00).
`define NRV_IS_IO_ADDR(addr) |addr[23:22] // Asserted if address is in IO space (then it needs additional wait states)
`include "PROCESSOR/utils.v"
// Exactly one NRV_FEMTORV32_xxx macro is expected to select the processor core.
// NOTE(review): nothing here enforces that a single core is selected - confirm
// that the configuration files guarantee it.
`ifdef NRV_FEMTORV32_QUARK
`include "PROCESSOR/femtorv32_quark.v" // Minimalistic version of the processor for IceStick (RV32I)
`endif
`ifdef NRV_FEMTORV32_QUARK_BICYCLE
`include "PROCESSOR/femtorv32_quark_bicycle.v" // Quark with Matthias's 2 CPI mode and barrel shifter (RV32I)
`endif
`ifdef NRV_FEMTORV32_TACHYON
`include "PROCESSOR/femtorv32_tachyon.v" // Version for the IceStick with higher maxfreq (RV32I)
`endif
`ifdef NRV_FEMTORV32_ELECTRON
`include "PROCESSOR/femtorv32_electron.v" // RV32IM with barrel shifter
`endif
`ifdef NRV_FEMTORV32_INTERMISSUM
`include "PROCESSOR/femtorv32_intermissum.v" // RV32IM with barrel shifter and interrupts
`endif
`ifdef NRV_FEMTORV32_GRACILIS
`include "PROCESSOR/femtorv32_gracilis.v" // RV32IMC with barrel shifter and interrupts
`endif
`ifdef NRV_FEMTORV32_PETITBATEAU
`include "PROCESSOR/femtorv32_petitbateau.v" // under development, RV32IMFC
`endif
`ifdef NRV_FEMTORV32_TESTDRIVE
`include "PROCESSOR/femtorv32_testdrive.v" // CPU under test
`endif
/******************************************************************************************************************/

View File

@@ -0,0 +1,305 @@
// Special version of femtosoc for mecrisp-quintus (Forth interpreter) on IceStick, by Matthias Koch
// mecrisp website: http://mecrisp.sourceforge.net/
`default_nettype none // Makes it easier to detect typos !
`define NRV_MINIRV32 // Mini config, can execute code stored in SPI flash from 1Mb offset (mapped to address 0x800000)
`define NRV_RUN_FROM_SPI_FLASH // Running code from the SPI flash (changes the constant for delay loops)
`define NRV_RESET_ADDR 24'h810000 // Directly jump into mapped SPI Flash,
`define NRV_COUNTER_WIDTH 32 // Number of bits in click counter
`include "PROCESSOR/femtorv32_quark.v" // Minimalistic version of the processor
`include "DEVICES/uart_picosoc_shrunk.v"
`include "DEVICES/MappedSPIFlash.v"
// SoC top level for mecrisp-quintus on the IceStick:
//   FemtoRV32 processor + 6 KB of RAM + memory-mapped SPI flash + UART +
//   LEDs + 24 GPIOs + a ring-oscillator random bit.
//
// Memory map (24-bit addresses):
//   address[23:22] == 00 : RAM
//   address[23:22] == 01 : IO page, 1-hot register select on address[21:2]
//   address[23:22] == 10 : mapped SPI flash
//
// All nets and constants are declared before their first use, which is
// required by strict tools since this file sets `default_nettype none.
module femtosoc(
   input oscillator,
   output D1, D2, D3, D4, D5,
   output TXD,
   input RXD,
   output spi_clk,
   output spi_cs_n,
   inout spi_mosi,
   inout spi_miso,
   // input IR_RX,
   // output IR_TX,
   // output IR_SD,
   inout PIO1_02, // PMOD 1
   inout PIO1_03, // PMOD 2
   inout PIO1_04, // PMOD 3
   inout PIO1_05, // PMOD 4
   inout PIO1_06, // PMOD 5
   inout PIO1_07, // PMOD 6
   inout PIO1_08, // PMOD 7
   inout PIO1_09, // PMOD 8
   inout PIO0_02, // Header 1
   inout PIO0_03, // Header 2
   inout PIO0_04, // Header 3
   inout PIO0_05, // Header 4
   inout PIO0_06, // Header 5
   inout PIO0_07, // Header 6
   inout PIO0_08, // Header 7
   inout PIO0_09, // Header 8
   inout PIO2_10, // Header 1
   inout PIO2_11, // Header 2
   inout PIO2_12, // Header 3
   inout PIO2_13, // Header 4
   inout PIO2_14, // Header 5
   inout PIO2_15, // Header 6
   inout PIO2_16, // Header 7
   inout PIO2_17, // Header 8
   input reset_button
);

   // ###### Clock #########################################

   wire clk; // Configured for 48 MHz
             // NOTE(review): 48 MHz = 12 MHz * (DIVF+1) / 2^DIVQ assumes a
             // 12 MHz oscillator input - confirm against the board.

   SB_PLL40_CORE #(
      .FEEDBACK_PATH("SIMPLE"),
      .PLLOUT_SELECT("GENCLK"),
      .DIVR(4'b0000),
      .DIVF(7'b0111111),
      .DIVQ(3'b100),
      .FILTER_RANGE(3'b001)  // last override: no trailing comma allowed here
   ) uut (
      .REFERENCECLK(oscillator),
      .PLLOUTCORE(clk),
      .RESETB(1'b1),
      .BYPASS(1'b0)
   );

   // ###### Reset logic ###################################

   // reset_cnt counts up from 0 and saturates at all-ones; resetq (handed to
   // the UART and to the processor) goes high once the counter is saturated.
   // Pulling reset_button low asynchronously restarts the count.
   reg [7:0] reset_cnt = 0;
   wire resetq = &reset_cnt;

   always @(posedge clk, negedge reset_button) begin
      if (!reset_button) reset_cnt <= 0;
      else               reset_cnt <= reset_cnt + !resetq;
   end

   // ###### Memory bus and address decoding ###############
   /*
    * Memory and memory interface
    * memory map:
    * address[21:2] RAM word address (4 Mb max).
    * address[23:22] 00: RAM
    *                01: IO page (1-hot) (starts at 0x400000)
    *                10: SPI Flash page (starts at 0x800000)
    */

   // The memory bus.
   wire [31:0] mem_address; // 24 bits are used internally. The two LSBs are ignored (using word addresses)
   wire [3:0]  mem_wmask;   // mem write mask and strobe /write Legal values are 0000,0001,0010,0100,1000,0011,1100,1111
   wire [31:0] mem_rdata;   // processor <- (mem and peripherals)
   wire [31:0] mem_wdata;   // processor -> (mem and peripherals)
   wire        mem_rstrb;   // mem read strobe. Goes high to initiate memory read.
   wire        mem_rbusy;   // processor <- (mem and peripherals). Stays high until a read transfer is finished.
   wire        mem_wbusy;   // processor <- (mem and peripherals). Stays high until a write transfer is finished.
   wire        mem_wstrb = |mem_wmask; // mem write strobe, goes high to initiate memory write (deduced from wmask)

   // Page decoding.
   wire mem_address_is_ram       = (mem_address[23:22] == 2'b00);
   wire mem_address_is_io        = (mem_address[23:22] == 2'b01);
   wire mem_address_is_spi_flash = (mem_address[23:22] == 2'b10);

   // IO bus.
   wire [31:0] io_rdata;
   wire [31:0] io_wdata = mem_wdata;
   wire        io_rstrb = mem_rstrb && mem_address_is_io;
   wire        io_wstrb = mem_wstrb && mem_address_is_io;
   wire [19:0] io_word_address = mem_address[21:2]; // word offset in io page
   wire        io_rbusy;
   wire        io_wbusy;

   // Mapped SPI flash read channel.
   wire        mapped_spi_flash_rbusy;
   wire [31:0] mapped_spi_flash_rdata;

   // ###### IO register map ###############################

   // We got a total of 20 bits for 1-hot addressing of IO registers.
   localparam IO_LEDS_bit      = 0; // RW four leds
   localparam IO_UART_DAT_bit  = 1; // RW write: data to send (8 bits) read: received data (8 bits)
   localparam IO_UART_CNTL_bit = 2; // R status. bit 8: valid read data. bit 9: busy sending
   localparam IO_PORT_IN_bit   = 3; // R: GPIO port in
   localparam IO_PORT_OUT_bit  = 4; // RW: GPIO port out
   localparam IO_PORT_DIR_bit  = 5; // RW: GPIO port dir
   // localparam IO_CYCLES_bit = 6;

   // ###### Cycle counter #################################

   //reg [31:0] cycles;
   //always @(posedge clk) cycles <= cycles + 1;

   // ###### LEDS ##########################################

   reg [4:0] LEDs;
   assign {D5,D4,D3,D2,D1} = LEDs;

   // ###### RING OSCILLATOR ###############################

   // Two LUTs chained in a loop with one inversion in the feedback path.
   // With LUT_INIT = 16'd2 and I1..I3 tied low, each LUT output follows I0.
   wire [1:0] buffers_in, buffers_out;
   assign buffers_in = {buffers_out[0:0], ~buffers_out[1]};

   SB_LUT4 #(
      .LUT_INIT(16'd2)
   ) buffers [1:0] (
      .O(buffers_out),
      .I0(buffers_in),
      .I1(1'b0),
      .I2(1'b0),
      .I3(1'b0)
   );

   wire random = ~buffers_out[1];

   // ###### GPIO ##########################################

   wire [24:0] port_in;   // bits 23:0: pin inputs, bit 24: ring oscillator bit
   reg  [23:0] port_out;  // value driven on a pin when its port_dir bit is set
   reg  [23:0] port_dir;  // per-pin output enable

   assign port_in[24] = random;

   // PMOD
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio0 (.PACKAGE_PIN(PIO1_02), .D_OUT_0(port_out[ 0]), .D_IN_0(port_in[ 0]), .OUTPUT_ENABLE(port_dir[ 0]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio1 (.PACKAGE_PIN(PIO1_03), .D_OUT_0(port_out[ 1]), .D_IN_0(port_in[ 1]), .OUTPUT_ENABLE(port_dir[ 1]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio2 (.PACKAGE_PIN(PIO1_04), .D_OUT_0(port_out[ 2]), .D_IN_0(port_in[ 2]), .OUTPUT_ENABLE(port_dir[ 2]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio3 (.PACKAGE_PIN(PIO1_05), .D_OUT_0(port_out[ 3]), .D_IN_0(port_in[ 3]), .OUTPUT_ENABLE(port_dir[ 3]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio4 (.PACKAGE_PIN(PIO1_06), .D_OUT_0(port_out[ 4]), .D_IN_0(port_in[ 4]), .OUTPUT_ENABLE(port_dir[ 4]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio5 (.PACKAGE_PIN(PIO1_07), .D_OUT_0(port_out[ 5]), .D_IN_0(port_in[ 5]), .OUTPUT_ENABLE(port_dir[ 5]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio6 (.PACKAGE_PIN(PIO1_08), .D_OUT_0(port_out[ 6]), .D_IN_0(port_in[ 6]), .OUTPUT_ENABLE(port_dir[ 6]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) fio7 (.PACKAGE_PIN(PIO1_09), .D_OUT_0(port_out[ 7]), .D_IN_0(port_in[ 7]), .OUTPUT_ENABLE(port_dir[ 7]));
   // Header 1
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio0 (.PACKAGE_PIN(PIO0_02), .D_OUT_0(port_out[ 8]), .D_IN_0(port_in[ 8]), .OUTPUT_ENABLE(port_dir[ 8]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio1 (.PACKAGE_PIN(PIO0_03), .D_OUT_0(port_out[ 9]), .D_IN_0(port_in[ 9]), .OUTPUT_ENABLE(port_dir[ 9]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio2 (.PACKAGE_PIN(PIO0_04), .D_OUT_0(port_out[10]), .D_IN_0(port_in[10]), .OUTPUT_ENABLE(port_dir[10]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio3 (.PACKAGE_PIN(PIO0_05), .D_OUT_0(port_out[11]), .D_IN_0(port_in[11]), .OUTPUT_ENABLE(port_dir[11]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio4 (.PACKAGE_PIN(PIO0_06), .D_OUT_0(port_out[12]), .D_IN_0(port_in[12]), .OUTPUT_ENABLE(port_dir[12]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio5 (.PACKAGE_PIN(PIO0_07), .D_OUT_0(port_out[13]), .D_IN_0(port_in[13]), .OUTPUT_ENABLE(port_dir[13]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio6 (.PACKAGE_PIN(PIO0_08), .D_OUT_0(port_out[14]), .D_IN_0(port_in[14]), .OUTPUT_ENABLE(port_dir[14]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) gio7 (.PACKAGE_PIN(PIO0_09), .D_OUT_0(port_out[15]), .D_IN_0(port_in[15]), .OUTPUT_ENABLE(port_dir[15]));
   // Header 2
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio0 (.PACKAGE_PIN(PIO2_10), .D_OUT_0(port_out[16]), .D_IN_0(port_in[16]), .OUTPUT_ENABLE(port_dir[16]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio1 (.PACKAGE_PIN(PIO2_11), .D_OUT_0(port_out[17]), .D_IN_0(port_in[17]), .OUTPUT_ENABLE(port_dir[17]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio2 (.PACKAGE_PIN(PIO2_12), .D_OUT_0(port_out[18]), .D_IN_0(port_in[18]), .OUTPUT_ENABLE(port_dir[18]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio3 (.PACKAGE_PIN(PIO2_13), .D_OUT_0(port_out[19]), .D_IN_0(port_in[19]), .OUTPUT_ENABLE(port_dir[19]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio4 (.PACKAGE_PIN(PIO2_14), .D_OUT_0(port_out[20]), .D_IN_0(port_in[20]), .OUTPUT_ENABLE(port_dir[20]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio5 (.PACKAGE_PIN(PIO2_15), .D_OUT_0(port_out[21]), .D_IN_0(port_in[21]), .OUTPUT_ENABLE(port_dir[21]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio6 (.PACKAGE_PIN(PIO2_16), .D_OUT_0(port_out[22]), .D_IN_0(port_in[22]), .OUTPUT_ENABLE(port_dir[22]));
   SB_IO #(.PIN_TYPE(6'b1010_01)) hio7 (.PACKAGE_PIN(PIO2_17), .D_OUT_0(port_out[23]), .D_IN_0(port_in[23]), .OUTPUT_ENABLE(port_dir[23]));

   // ###### UART ##########################################

   wire serial_valid, serial_busy;
   wire [7:0] serial_data;
   wire serial_wr = io_wstrb & io_word_address[IO_UART_DAT_bit];
   wire serial_rd = io_rstrb & io_word_address[IO_UART_DAT_bit];

   buart #(
      .FREQ_MHZ(48),
      .BAUDS(115200)
   ) the_buart (
      .clk(clk),
      .resetq(resetq),
      .rx(RXD),
      .tx(TXD),
      .rd(serial_rd),
      .wr(serial_wr),
      .valid(serial_valid),
      .busy(serial_busy),
      .tx_data(io_wdata[7:0]),
      .rx_data(serial_data)
   );

   // ###### IO PORTS ######################################

   // Read mux: each register contributes its value when its select bit is
   // set, 0 otherwise; contributions are ORed together. The UART data and
   // control registers deliberately return the same word
   // (bit 9: busy, bit 8: valid, bits 7:0: received data).
   assign io_rdata =
      (io_word_address[IO_UART_DAT_bit ] ? {22'd0, serial_busy, serial_valid, serial_data} : 32'd0) |
      (io_word_address[IO_UART_CNTL_bit] ? {22'd0, serial_busy, serial_valid, serial_data} : 32'd0) |
      (io_word_address[IO_PORT_IN_bit  ] ? { 7'd0, port_in }                               : 32'd0) |
      (io_word_address[IO_PORT_OUT_bit ] ? { 8'd0, port_out}                               : 32'd0) |
      (io_word_address[IO_PORT_DIR_bit ] ? { 8'd0, port_dir}                               : 32'd0) |
   // (io_word_address[IO_CYCLES_bit   ] ? cycles                                          : 32'd0) |
      (io_word_address[IO_LEDS_bit     ] ? {27'd0, LEDs    }                               : 32'd0);

   // Register writes; only the low bits of io_wdata that fit are stored.
   always @(posedge clk) begin
      if (io_wstrb && io_word_address[IO_LEDS_bit])     LEDs     <= io_wdata[4:0];
      if (io_wstrb && io_word_address[IO_PORT_OUT_bit]) port_out <= io_wdata[23:0];
      if (io_wstrb && io_word_address[IO_PORT_DIR_bit]) port_dir <= io_wdata[23:0];
   end

   // For now, we got no device that has blocking reads or writes
   assign io_rbusy = 0 ;
   assign io_wbusy = 0 ;

   // ###### Mapped SPI flash ##############################

   MappedSPIFlash mapped_spi_flash(
      .clk(clk),
      .rstrb(mem_rstrb && mem_address_is_spi_flash),
      .word_address(mem_address[21:2]),
      .rdata(mapped_spi_flash_rdata),
      .rbusy(mapped_spi_flash_rbusy),
      .CLK(spi_clk),
      .CS_N(spi_cs_n),
      .IO({spi_miso,spi_mosi})
   );

   // ###### RAM ###########################################

   // 6144 bytes of RAM with per-byte write enables and a registered read.
   // The word address is not range-checked against the RAM size.
   wire [19:0] ram_word_address = mem_address[21:2];
   reg [31:0] RAM[(6144/4)-1:0];
   reg [31:0] ram_rdata;

   always @(posedge clk) begin
      if(mem_address_is_ram) begin
         if(mem_wmask[0]) RAM[ram_word_address][ 7:0 ] <= mem_wdata[ 7:0 ];
         if(mem_wmask[1]) RAM[ram_word_address][15:8 ] <= mem_wdata[15:8 ];
         if(mem_wmask[2]) RAM[ram_word_address][23:16] <= mem_wdata[23:16];
         if(mem_wmask[3]) RAM[ram_word_address][31:24] <= mem_wdata[31:24];
      end
      ram_rdata <= RAM[ram_word_address];
   end

   // ###### Bus return path ###############################

   assign mem_rbusy = io_rbusy | mapped_spi_flash_rbusy ;
   assign mem_wbusy = io_wbusy;

   assign mem_rdata = mem_address_is_io  ? io_rdata  :
                      mem_address_is_ram ? ram_rdata :
                                           mapped_spi_flash_rdata;

   /****************************************************************/
   /* And last but not least, the processor */

   FemtoRV32 #(
      .ADDR_WIDTH(24)
   ) processor(
      .clk(clk),
      .mem_addr(mem_address),
      .mem_wdata(mem_wdata),
      .mem_wmask(mem_wmask),
      .mem_rdata(mem_rdata),
      .mem_rstrb(mem_rstrb),
      .mem_rbusy(mem_rbusy),
      .mem_wbusy(mem_wbusy),
      .reset(resetq)
   );

endmodule

56
RTL/get_config.v Normal file
View File

@@ -0,0 +1,56 @@
/*
* A dummy IVERILOG module to get some configured variables from
* verilog sources and output them to FIRMWARE/config.mk.
* (see TOOLS/make_config.sh)
*/
`include "femtosoc_config.v"
// Prints the configured variables as `make` assignments on standard output
// (ARCH, OPTIMIZE, ABI, RAM_SIZE, then a single DEVICES line made of -D flags).
// Everything is printed from one initial block; the module has no ports.
module dummy();
   initial begin
      $display("ARCH=",`NRV_ARCH);
      $display("OPTIMIZE=",`NRV_OPTIMIZE);
      $display("ABI=",`NRV_ABI);
      $display("RAM_SIZE=%d",`NRV_RAM);
      // Note1: for now we only need FGA here for conditional
      // compilation of OLED->FGA emulation (that pulls too
      // much code on the IceStick). The rest of the code uses
      // hardware config registers to query config and adapt
      // dynamically.
      // Note2: need to be "-DXXX=1" rather than "-DXXX" because
      // the makefile also passes that to the assembler after
      // some text substitution, and the assembler needs "=1"
      $write("DEVICES=");
`ifdef NRV_IO_FGA
      $write(" -DFGA=1");
`endif
`ifdef NRV_IO_SSD1351
      $write(" -DSSD1351=1");
`endif
`ifdef NRV_IO_SSD1331
      $write(" -DSSD1331=1");
`endif
`ifdef NRV_IO_SDCARD
      $write(" -DSDCARD=1");
`endif
      // The memory-mapped SPI flash is enabled by NRV_MAPPED_SPI_FLASH (the
      // name tested by femtosoc_config.v). The former spelling
      // NRV_IO_MAPPED_SPI_FLASH is still honoured for backward compatibility.
`ifdef NRV_MAPPED_SPI_FLASH
      $write(" -DSPIFLASH=1");
`elsif NRV_IO_MAPPED_SPI_FLASH
      $write(" -DSPIFLASH=1");
`endif
`ifdef ICE_STICK
      $write(" -DICE_STICK=1");
`endif
`ifdef ICE_BREAKER
      $write(" -DICE_BREAKER=1");
`endif
`ifdef ICE_SUGAR_NANO
      $write(" -DICE_SUGAR_NANO=1");
`endif
      $write("\n"); // terminate the DEVICES line
   end
endmodule