newStep.v

2025-11-27 04:28:54 +03:00
parent a84b8fcfde
commit 6e38a6c1af
85 changed files with 25646 additions and 6801 deletions
@@ -0,0 +1,16 @@
+# FemtoRV processor collection
+
+FemtoRV is a collection of small and understandable RISC-V processors.
+
+See this table to choose the most suitable one for your project!
+
+File name                 | ISA            | Special capabilities
+------------------------- | -------------- | --------
+femtorv32_quark.v         | RV32I          | The smallest core in this collection, perfect for tiny FPGAs. For size reasons, it shifts only one bit per clock cycle.
+femtorv32_quark_bicycle.v | RV32I          | The simplest and fastest - in terms of cycles/instruction - core in this collection. Basically Quark with a barrel shifter and additional multiplexers. Recommended if you can afford a few more LUTs and just need a vanilla RV32I.
+femtorv32_tachyon.v       | RV32I          | Quark with execute cycle split in two in order to achieve a higher maximum clock frequency, but at the expense of more cycles per instruction.
+femtorv32_electron.v      | RV32IM         | Featuring barrel shifter, multiplication and division instructions.
+femtorv32_intermissum.v   | RV32IM + IRQ   | Full interrupt support along with CSR registers.
+femtorv32_gracilis.v      | RV32IMC + IRQ  | With compressed instructions support, saves both RAM usage and memory fetch cycles. Recommended as general-purpose processor.
+femtorv32_individua.v     | RV32IMAC + IRQ | Also available with atomic instructions support. Not really necessary in single processor designs, but probably useful if you have tricky interrupt handlers.
+femtorv32_petitbateau.v   | RV32IMFC + IRQ | Floating point!
@@ -0,0 +1,7 @@
+This directory contains several versions of femtorv32 that I use
+to test different features and their influence on timings:
+- testdrive_RV32IM:      tachyon core (with two execute cycles) with M extension
+- testdrive_RV32IM_simF: M extension, F decoder and simulated FPU (works only with Verilator)
+- testdrive_RV32IMF:     M and F extensions
+
+I recommend using the other cores instead.
@@ -0,0 +1,479 @@
+/******************************************************************************/
+//     Electron: valid. fmax: 70 MHz  exp. fmax: 80 MHz
+// TestDrive: morphing tachyon into a RV32IMF core, trying to 
+// preserve maxfreq at each step.
+// Step 0: Tachyon       valid. fmax: 115-120 MHz  exp. fmax: 135-140 MHz
+// Step 1: Barrel shft   valid. fmax: 110-115 MHz  exp. fmax: 130-135 MHz
+// Step 2: RV32M         valid. fmax: 105-115 MHz  exp. fmax: 120     MHz 
+
+//           
+/******************************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32im"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-O3"
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output [3:0]  mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000; 
+   parameter ADDR_WIDTH       = 24;           
+
+   localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
+
+
+   // Bit-reverse a 32-bit word: result[i] = x[31-i].
+   // Used by the shifter, so that a single right-shifter can serve both
+   // left and right shifts (saves silicon).
+   function [31:0] flip32;
+      input [31:0] x;
+      integer bitIdx;
+      begin
+         for (bitIdx = 0; bitIdx < 32; bitIdx = bitIdx + 1)
+            flip32[bitIdx] = x[31 - bitIdx];
+      end
+   endfunction
+
+ /***************************************************************************/
+ // Instruction decoding.
+ /***************************************************************************/
+
+ // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction. 
+ // Reference: Table page 104 of:
+ // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+ // Destination register index, taken from the latched instruction.
+ wire [4:0] rdId = instr[11:7];
+
+ // One-hot decode of funct3: funct3Is[k] is high exactly when funct3 == k.
+ // Selecting on a one-hot vector costs fewer LUTs than re-comparing the
+ // 3-bit field wherever it is needed.
+ (* onehot *) reg  [7:0] funct3Is;
+
+ // Instruction-class flags, one per major opcode handled by this core
+ // (base RV32I has only these ten).
+   reg isLoad,  isALUimm, isAUIPC, isStore, isALUreg,
+       isLUI,   isBranch, isJALR,  isJAL,   isSYSTEM;
+
+ // The five immediate formats, sign-extended to 32 bits.
+   reg [31:0] Uimm, Iimm, Simm, Bimm, Jimm;
+
+   // Decode directly from the memory bus on the clock edge that ends the
+   // WAIT_INSTR state (i.e. once the memory is no longer busy).
+   always @(posedge clk) begin
+      if(state[WAIT_INSTR_bit] & !mem_rbusy) begin
+         // funct3, one-hot
+         funct3Is  <= 8'b00000001 << mem_rdata[14:12];
+
+         // Immediates
+         Uimm <= {mem_rdata[31:12], 12'b0};
+         Iimm <= {{20{mem_rdata[31]}}, mem_rdata[31:20]};
+         Simm <= {{20{mem_rdata[31]}}, mem_rdata[31:25], mem_rdata[11:7]};
+         Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0};
+         Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0};
+
+         // Opcode class (bits 6:2; bits 1:0 are not examined)
+         isLoad    <= (mem_rdata[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+         isStore   <= (mem_rdata[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+         isALUimm  <= (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+         isALUreg  <= (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
+         isLUI     <= (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
+         isAUIPC   <= (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
+         isBranch  <= (mem_rdata[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+         isJAL     <= (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+         isJALR    <= (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+         isSYSTEM  <= (mem_rdata[6:2] == 5'b11100); // rd <- cycles
+      end
+   end
+
+   // Any ALU instruction, immediate or register form.
+   wire isALU = isALUreg | isALUimm;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+   
+   // Source operands, latched by the state machine when the instruction
+   // word arrives, and the 32 x 32-bit integer register file.
+   reg [31:0] rs1, rs2;
+   reg [31:0] registerFile [31:0];
+
+   // Single write port. Writes to register 0 are suppressed, so entry 0
+   // is never modified by this block.
+   always @(posedge clk) begin
+      if (writeBack && (rdId != 0)) begin
+         registerFile[rdId] <= writeBackData;
+      end
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except shifts.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   // (every class other than ALUreg/Branch falls into the Iimm case)
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   // ALU write strobe. Declared here, driven further down as
+   // state[EXECUTE1_bit] & isALU; it (re)starts the divider.
+   wire aluWr;               // ALU write strobe
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   // {1'b1,~aluIn2} + 1 is -aluIn2 in 33-bit two's complement, so
+   // aluMinus = aluIn1 - aluIn2 computed on zero-extended operands:
+   //   aluMinus[31:0] is the 32-bit difference,
+   //   aluMinus[32]   is set when aluIn1 < aluIn2 as unsigned numbers.
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   // Signed less-than: if the sign bits differ, the negative operand is the
+   // smaller one; otherwise the unsigned comparison gives the answer.
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   // Unsigned less-than is the borrow of the 33-bit subtract.
+   wire        LTU = aluMinus[32];
+   // Equality: the 32-bit difference is zero.
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   /***************************************************************************/
+
+   // Use the same shifter both for left and right shifts by 
+   // applying bit reversal
+
+   // funct3 == 1 is the left shift: reverse the operand first so that the
+   // right shift below moves bits in the intended direction.
+   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
+   
+   // 33-bit arithmetic right shift. The extra top bit is the sign of
+   // aluIn1 when instr[30] is set (SRA) and 0 otherwise, so the same
+   // operator yields both the arithmetic and the logical right shift.
+   // The 33-bit result is truncated to 32 bits (hence the lint pragma).
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter = 
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   // Reverse again to obtain the left-shift result (only selected by
+   // alu_base when funct3 == 1).
+   wire [31:0] leftshift = flip32(shifter);
+   
+   /***************************************************************************/
+
+   // Multiplier operand signedness. funct3: 1->MULH, 2->MULHSU, 3->MULHU
+   //   MULH   : signed   rs1 x signed   rs2
+   //   MULHSU : signed   rs1 x unsigned rs2
+   //   MULHU  : unsigned rs1 x unsigned rs2
+   // (MUL, funct3 == 0, only uses the low 32 bits of the product, which do
+   //  not depend on operand signedness.)
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   // Both operands are widened to 33 bits so that one signed multiplier
+   // covers all variants: bit 32 replicates the operand's sign bit when
+   // that operand must be treated as signed, and is 0 otherwise.
+   // rs1 (aluIn1) is signed for MULH and MULHSU; rs2 (aluIn2) is signed
+   // for MULH only.
+   wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
+   wire sign2 = aluIn2[31] &  isMULH;
+
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+   wire signed [63:0] multiply = signed1 * signed2;
+
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   // Base (RV32I) ALU result, built as an AND-OR mux: each line contributes
+   // its value when the matching funct3 bit is set and 32'b0 otherwise.
+   wire [31:0] alu_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   // funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
+   //         4->DIV 5->DIVU 6->REM    7->REMU
+   
+   // Multiply result: low word for MUL, high word for the MULH variants.
+   wire [31:0] alu_mul = funct3Is[0] ? multiply[31: 0]   // 0:MUL
+                                     : multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
+
+   // Divide result. instr[13] (funct3[1]) selects REM/REMU over DIV/DIVU.
+   // The divider works on magnitudes; div_sign requests negation of the
+   // result. The 'dividend' register is what remains after the subtract
+   // steps, i.e. the remainder.
+   wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend) 
+    	                           : (div_sign ? -quotient : quotient);
+   
+
+   wire        aluBusy = |quotient_msk; // ALU is busy if division is in progress.
+   // Registered ALU result, the value seen by writeBackData.
+   reg [31:0]  aluOut;
+
+   // instr[25] is the lowest funct7 bit; together with isALUreg it selects
+   // the multiply/divide results below.
+   wire funcM     = instr[25];
+   // instr[14] is funct3[2]: set for DIV/DIVU/REM/REMU (funct3 4..7).
+   wire isDivide  = instr[14];
+   
+   // aluOut is re-registered on every clock, so it trails the combinational
+   // result by one cycle.
+   always @(posedge clk) begin
+      aluOut <=  (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
+   end
+
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+   //
+   // Shift-and-subtract division on operand magnitudes, one quotient bit
+   // per clock. The sign of the result is applied afterwards by alu_div,
+   // using div_sign.
+
+   reg div_sign;              // set when the selected result must be negated
+
+   reg [31:0] dividend;       // |rs1| at start; holds the remainder afterwards
+   reg [62:0] divisor;        // |rs2| pre-shifted left by 31, shifted right each step
+   reg [31:0] quotient;       // quotient bits, shifted in from the LSB
+   reg [32:0] quotient_msk;   // progress marker; non-zero <=> aluBusy
+
+   always @(posedge clk) begin
+      if (aluWr) begin
+	 // Operand setup. instr[12] (funct3[0]) is 1 for the unsigned
+	 // variants (DIVU/REMU), in which case operands are used as-is;
+	 // otherwise negative operands are replaced by their magnitude.
+	 dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+	 divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+	 quotient <= 0;
+	 // Only bit 32 is written here; it is set for divide instructions
+	 // and then travels down to bit 0 through the shift below, keeping
+	 // aluBusy asserted meanwhile.
+	 // NOTE(review): bits 31:0 keep their previous value; presumably
+	 // they are already zero whenever a new instruction starts - confirm.
+	 quotient_msk[32] <= isALUreg & funcM & isDivide;
+	 // Signed variants only: the remainder (instr[13] set) takes the
+	 // sign of the dividend; the quotient is negated when the operand
+	 // signs differ, except when the divisor is zero.
+	 div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] : 
+                      (aluIn1[31] ^ aluIn2[31]) & |aluIn2);
+      end else begin
+	 // One division step. This branch executes on every clock where
+	 // aluWr is low, whether or not a division is in progress.
+	 divisor      <= divisor >> 1;
+	 quotient_msk <= quotient_msk >> 1;
+	 if(divisor <= {31'b0, dividend}) begin
+	    // Divisor fits: emit a 1 and subtract it from the running remainder.
+	    quotient <= {quotient[30:0],1'b1};
+	    dividend <= dividend - divisor[31:0];
+	 end else begin
+	    quotient <= {quotient[30:0],1'b0};
+	 end
+      end
+   end
+   
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   // Combinational branch decision for the decoded funct3, computed from
+   // the comparator outputs EQ / LT / LTU.
+   wire predicate_ = (funct3Is[0] &  EQ ) |   // BEQ
+                     (funct3Is[1] & ~EQ ) |   // BNE
+                     (funct3Is[4] &  LT ) |   // BLT
+                     (funct3Is[5] & ~LT ) |   // BGE
+                     (funct3Is[6] &  LTU) |   // BLTU
+                     (funct3Is[7] & ~LTU);    // BGEU
+
+   // Registered copy of predicate_, captured by the state machine.
+   reg predicate;
+   
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   // Address of the next sequential instruction.
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // An adder used to compute branch address, JAL address and AUIPC.
+   // (Registered: the sum is formed by the state machine in EXECUTE1.)
+   reg [ADDR_WIDTH-1:0]  PCplusImm;
+
+   // A separate adder to compute the destination of load/store.   
+   // (Also registered in EXECUTE1.)
+   reg [ADDR_WIDTH-1:0]  loadstore_addr;
+   
+   // Address bus: PC while an instruction is being fetched or waited for,
+   // the load/store address otherwise. Only ADDR_WIDTH bits are
+   // implemented; the upper bits are driven to zero through ADDR_PAD.
+   assign mem_addr = {ADDR_PAD, 
+		       state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ? 
+		       PC : loadstore_addr
+		     };
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   // AND-OR mux over the instruction class: each term is zero unless its
+   // class flag is set. Branch and Store contribute nothing here (their
+   // write-back is disabled by writeBack).
+   wire [31:0] writeBackData  =
+      /* verilator lint_off WIDTH */	       	       
+      // cycles may be narrower than 32 bits when NRV_COUNTER_WIDTH is
+      // defined, hence the WIDTH pragma around this term.
+      (isSYSTEM            ? cycles               : 32'b0) |  // SYSTEM
+      /* verilator lint_on WIDTH */	       	       	       
+      (isLUI               ? Uimm                 : 32'b0) |  // LUI
+      (isALU               ? aluOut               : 32'b0) |  // ALUreg, ALUimm
+      (isAUIPC             ? {ADDR_PAD,PCplusImm} : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? {ADDR_PAD,PCplus4  } : 32'b0) |  // JAL, JALR
+      (isLoad              ? LOAD_data            : 32'b0);   // Load
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // Memory is always accessed one aligned 32-bit word at a time.
+   // Byte and halfword loads/stores are built on top of that from:
+   // - funct3[1:0] (instr[13:12]): 00->byte 01->halfword 10->word
+   // - loadstore_addr[1:0]: which byte/halfword of the word is accessed
+
+   wire mem_byteAccess     = (instr[13:12] == 2'b00);
+   wire mem_halfwordAccess = (instr[13:12] == 2'b01);
+
+   // LOAD
+   // Pick the addressed halfword of the word, then the addressed byte
+   // of that halfword.
+   wire [15:0] LOAD_halfword =
+      loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+      loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion
+   wire LOAD_sign =
+      ~instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   wire [31:0] LOAD_data =
+      mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
+      mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                           mem_rdata;
+
+   // STORE
+   // The low byte / halfword of rs2 is copied into every lane it could be
+   // written to; the write mask then enables the lanes actually stored.
+   assign mem_wdata = {
+      (loadstore_addr[0] ? rs2[7:0] :
+       loadstore_addr[1] ? rs2[15:8] : rs2[31:24]),   // lane 3
+      (loadstore_addr[1] ? rs2[7:0] : rs2[23:16]),    // lane 2
+      (loadstore_addr[0] ? rs2[7:0] : rs2[15:8]),     // lane 1
+      rs2[7:0]                                        // lane 0
+   };
+
+   // The memory write mask:
+   //    1111                     if writing a word
+   //    0011 or 1100             if writing a halfword
+   //                                (depending on loadstore_addr[1])
+   //    0001, 0010, 0100 or 1000 if writing a byte
+   //                                (depending on loadstore_addr[1:0])
+   wire [3:0] STORE_wmask =
+      mem_byteAccess     ? {  loadstore_addr[1] &  loadstore_addr[0],
+                              loadstore_addr[1] & ~loadstore_addr[0],
+                             ~loadstore_addr[1] &  loadstore_addr[0],
+                             ~loadstore_addr[1] & ~loadstore_addr[0] } :
+      mem_halfwordAccess ? { {2{loadstore_addr[1]}}, {2{~loadstore_addr[1]}} } :
+                           4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   // Bit position of each state in the one-hot state register.
+   localparam FETCH_INSTR_bit     = 0;
+   localparam WAIT_INSTR_bit      = 1;
+   localparam EXECUTE1_bit        = 2;
+   localparam EXECUTE2_bit        = 3;   
+   localparam WAIT_ALU_OR_MEM_bit = 4;
+   localparam NB_STATES           = 5;
+
+   // One-hot state values derived from the bit positions above.
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
+   localparam EXECUTE1        = 1 << EXECUTE1_bit;
+   localparam EXECUTE2        = 1 << EXECUTE2_bit;   
+   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+   
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // register write-back enable.
+   // Active for every class except Branch and Store, during EXECUTE2 and
+   // on every cycle spent in WAIT_ALU_OR_MEM (so the destination register
+   // may be written several times; the last write is the one that remains).
+   wire writeBack = ~(isBranch | isStore ) & 
+	            (state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]);
+
+   // The memory-read signal.
+   // Asserted for an instruction fetch, and in EXECUTE2 of a load.
+   assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   // Non-zero only in EXECUTE2 of a store.
+   assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
+
+   // aluWr starts computation (shifts) in the ALU.
+   // In this core it is the divider that is (re)initialised by aluWr.
+   assign aluWr = state[EXECUTE1_bit] & isALU;
+
+   // Taken control transfer to PC + immediate: JAL, or a branch whose
+   // registered predicate is true.
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+   // needToWait: after EXECUTE2, go through WAIT_ALU_OR_MEM instead of
+   // fetching directly. When NRV_IS_IO_ADDR is defined, stores only wait
+   // if that macro flags mem_addr; otherwise every store waits.
+`ifdef NRV_IS_IO_ADDR  
+   wire needToWait = isLoad | 
+		     isStore  & `NRV_IS_IO_ADDR(mem_addr) | 
+		     aluBusy;
+`else
+   wire needToWait = isLoad | isStore | aluBusy;   
+`endif
+   
+   // Main sequencer. Normal flow:
+   //   FETCH_INSTR -> WAIT_INSTR -> EXECUTE1 -> EXECUTE2
+   //               -> [WAIT_ALU_OR_MEM] -> FETCH_INSTR
+   always @(posedge clk) begin
+      // Active-low reset: PC is loaded with RESET_ADDR and the machine
+      // starts in WAIT_ALU_OR_MEM, from which it moves to FETCH_INSTR once
+      // the ALU and the memory are idle.
+      if(!reset) begin
+         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        // Instruction word expected on mem_rdata: latch it together with
+        // both source registers, read using the register indices taken
+        // directly from the bus.
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              rs1 <= registerFile[mem_rdata[19:15]];
+              rs2 <= registerFile[mem_rdata[24:20]];
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE1;        // also the declaration of instr).
+           end
+        end
+
+        // First execute cycle: register the target addresses and the
+        // branch predicate, so that EXECUTE2 only has to select among them.
+        state[EXECUTE1_bit]: begin
+	   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+	   // Equivalent to:
+	   //  PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+	   PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] : 
+			       instr[4] ? Uimm[ADDR_WIDTH-1:0] : 
+			                  Bimm[ADDR_WIDTH-1:0] );
+
+	   // testing instr[5] is equivalent to testing isStore in this context.
+	   loadstore_addr <= rs1[ADDR_WIDTH-1:0] + 
+ 		     (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+	   
+	   predicate <= predicate_;
+	   state <= EXECUTE2;
+	end
+	
+        // Second execute cycle: update PC (JALR target with bit 0 cleared,
+        // PC+imm for taken branches and JAL, PC+4 otherwise) and decide
+        // whether a wait state is needed.
+        state[EXECUTE2_bit]: begin
+           PC <= isJALR          ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+                 jumpToPCplusImm ? PCplusImm :
+                 PCplus4;
+	   state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+        end
+
+        // Stay here until the divider and both memory directions are idle.
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+        end
+
+        // FETCH_INSTR lasts a single cycle (mem_rstrb is asserted
+        // combinatorially from the state bit).
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+	
+      endcase
+   end
+
+   /***************************************************************************/
+   // Cycle counter
+   /***************************************************************************/
+
+   // Free-running cycle counter, incremented on every clock and not
+   // affected by reset. It is 32 bits wide unless NRV_COUNTER_WIDTH
+   // overrides the width.
+`ifndef NRV_COUNTER_WIDTH
+   reg [31:0] cycles;
+`else
+   reg [`NRV_COUNTER_WIDTH-1:0] cycles;
+`endif
+   always @(posedge clk) begin
+      cycles <= cycles + 1;
+   end
+
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements,
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed. 
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this: 
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a both more compact (fewer LUTs) and faster state machine.
+
@@ -0,0 +1,689 @@
+/******************************************************************************/
+//     Electron: valid. fmax: 70 MHz  exp. fmax: 80 MHz
+// TestDrive: morphing tachyon into a RV32IMF core, trying to 
+// preserve maxfreq at each step.
+// Step 0: Tachyon            valid. fmax: 115-120 MHz  exp. fmax: 135-140 MHz
+// Step 1: Barrel shft        valid. fmax: 110-115 MHz  exp. fmax: 130-135 MHz
+// Step 2: RV32M              valid. fmax: 105-115 MHz  exp. fmax: 120     MHz 
+// Step 3: RV32F  decod only  valid. fmax: 100-105 MHz  exp. fmax: 105     MHz
+
+//           
+/******************************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32imaf"
+`define NRV_ABI      "ilp32f"
+
+//`define NRV_ARCH     "rv32im"
+//`define NRV_ABI      "ilp32"
+
+`define NRV_OPTIMIZE "-O3"
+
+// Check condition and display message in simulation
+`ifdef BENCH
+ `define ASSERT(cond,msg) if(!(cond)) $display msg
+ `define ASSERT_NOT_REACHED(msg) $display msg
+`else
+ `define ASSERT(cond,msg)
+ `define ASSERT_NOT_REACHED(msg)
+`endif
+
+// Count Leading Zeroes (CLZ).
+// FPU normalization needs the position of the first bit set in the
+// A_frac register; counting the zeroes above it is the easier way to get
+// there. Recursive divide-and-conquer scheme, see:
+// https://electronics.stackexchange.com/questions/196914/verilog-synthesize-high-speed-leading-zero-count
+//
+// At each level the input is split in two halves. If the upper half is
+// all zero, the top bit of the count is 1 and the search continues in the
+// lower half; otherwise the top bit is 0 and the search continues in the
+// upper half. For an all-zero input every output bit is 1, i.e. the
+// result is W_IN-1.
+module CLZ #(
+   parameter W_IN = 64, // must be power of 2, >= 2
+   parameter W_OUT = $clog2(W_IN)
+) (
+   input wire [W_IN-1:0]   in,
+   output wire [W_OUT-1:0] out
+);
+  localparam HALF = W_IN/2; // width of each half of the input
+
+  generate
+     if(W_IN != 2) begin
+        // Recursive case: resolve the top bit of the count here, delegate
+        // the remaining bits to a half-width instance.
+        wire [HALF-1:0]  upper = in[2*HALF-1:HALF];
+        wire [HALF-1:0]  lower = in[HALF-1:0];
+        wire             upper_is_zero = ~(|upper);
+        wire [W_OUT-2:0] sub_count;
+        CLZ #(
+          .W_IN(HALF)
+        ) inner(
+           .in(upper_is_zero ? lower : upper),
+           .out(sub_count)
+        );
+        assign out = {upper_is_zero, sub_count};
+     end else begin
+        // Base case, two input bits: the count is 1 unless the top bit is set.
+        assign out = !in[1];
+     end
+  endgenerate
+endmodule   
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output [3:0]  mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000; 
+   parameter ADDR_WIDTH       = 24;           
+
+   localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
+
+
+   // Bit-reverse a 32-bit word: result[i] = x[31-i].
+   // Used by the shifter, so that a single right-shifter can serve both
+   // left and right shifts (saves silicon).
+   function [31:0] flip32;
+      input [31:0] x;
+      integer bitIdx;
+      begin
+         for (bitIdx = 0; bitIdx < 32; bitIdx = bitIdx + 1)
+            flip32[bitIdx] = x[31 - bitIdx];
+      end
+   endfunction
+
+ /***************************************************************************/
+ // Instruction decoding.
+ /***************************************************************************/
+
+ // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction. 
+ // Reference: Table page 104 of:
+ // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+ // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+ // It is used as follows: funct3Is[val] <=> funct3 == val
+ (* onehot *) reg  [7:0] funct3Is;
+
+ // Instruction decoder and immediate decoder
+ // Base RISC-V (RV32I) has only 10 different instructions !
+   
+   reg isLoad,   isALUimm, isAUIPC, isStore,  isALUreg, isLUI,
+       isBranch, isJALR,   isJAL,   isSYSTEM, isFPU;
+  
+   reg [31:0] Uimm, Iimm, Simm, Bimm, Jimm;
+   reg 	      rdIsNZ; // Asserted if dest. register is non-zero (writeback)
+   
+   always @(posedge clk) begin
+      if(state[WAIT_INSTR_bit]) begin
+	 isLoad    <=  (mem_rdata[6:3] == 4'b0000);  // rd <- mem[rs1+Iimm]
+	 isALUimm  <=  (mem_rdata[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+	 isAUIPC   <=  (mem_rdata[6:2] == 5'b00101); // rd <- PC + Uimm
+	 isStore   <=  (mem_rdata[6:3] == 4'b0100);  // mem[rs1+Simm] <- rs2
+	 isALUreg  <=  (mem_rdata[6:2] == 5'b01100); // rd <- rs1 OP rs2
+	 isLUI     <=  (mem_rdata[6:2] == 5'b01101); // rd <- Uimm
+	 isBranch  <=  (mem_rdata[6:2] == 5'b11000); // if(rs1OPrs2) PC<-PC+Bimm
+	 isJALR    <=  (mem_rdata[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+	 isJAL     <=  (mem_rdata[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+	 isSYSTEM  <=  (mem_rdata[6:2] == 5'b11100); // rd <- cycles
+	 isFPU     <=  (mem_rdata[6:5] == 2'b10);    // all FPU except FLW/FSW 
+	 funct3Is  <= 8'b00000001 << mem_rdata[14:12];
+
+	 Uimm <= {    mem_rdata[31],   mem_rdata[30:12], {12{1'b0}}};
+	 Iimm <= {{21{mem_rdata[31]}}, mem_rdata[30:20]};
+	 Simm <= {{21{mem_rdata[31]}}, mem_rdata[30:25],mem_rdata[11:7]};
+	 Bimm <= {{20{mem_rdata[31]}}, mem_rdata[7],mem_rdata[30:25],mem_rdata[11:8],1'b0};
+	 Jimm <= {{12{mem_rdata[31]}}, mem_rdata[19:12],mem_rdata[20],mem_rdata[30:21],1'b0};
+
+	 rdIsNZ <= |mem_rdata[11:7];
+      end 
+   end
+   
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+   reg [31:0] rs3; // this one is used by the FMA instructions.
+   
+   reg [31:0] registerFile [0:63]; //  0..31: integer registers
+                                   // 32..63: floating-point registers
+   
+   /***************************************************************************/
+   // The FPU 
+   /***************************************************************************/
+
+   // instruction decoder
+
+   reg isFMADD, isFMSUB,  isFNMSUB, isFNMADD,  isFADD,   isFSUB, isFMUL, isFDIV,
+       isFSQRT, isFSGNJ,  isFSGNJN, isFSGNJX,  isFMIN,   isFMAX, isFEQ,  isFLT,
+       isFLE,   isFCLASS, isFCVTWS, isFCVTWUS, isFCVTSW, isFCVTSWU, isFMVXW,
+       isFMVWX;
+   
+   reg rdIsFP; // Asserted if destination register is a FP register.
+
+   // rs1 is a FP register if instr[6:5] = 2'b10 except for:
+   //   FCVT.S.W{U}:  instr[6:2] = 5'b10100 and instr[30:28] = 3'b101
+   //   FMV.W.X    :  instr[6:2] = 5'b10100 and instr[30:28] = 3'b111
+   // (two versions of the signal, one for regular instruction decode,
+   //  the other one for compressed instructions).
+   wire rs1IsFP = (mem_rdata[6:5]   == 2'b10 ) &&  
+                     !((mem_rdata[4:2]  == 3'b100) && (
+                      (mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
+     	              (mem_rdata[31:28] == 4'b1111)    // FMV.W.X
+                    )						    
+		  );
+
+   // rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
+   // (two versions of the signal, one for regular instruction decode,
+   //  the other one for compressed instructions).
+   wire rs2IsFP = (mem_rdata[6:5] == 2'b10) || (mem_rdata[6:2]==5'b01001);
+
+   always @(posedge clk) begin
+      if(state[WAIT_INSTR_bit]) begin
+	 isFMADD   <= (mem_rdata[4:2] == 3'b000); 
+	 isFMSUB   <= (mem_rdata[4:2] == 3'b001); 
+	 isFNMSUB  <= (mem_rdata[4:2] == 3'b010); 
+	 isFNMADD  <= (mem_rdata[4:2] == 3'b011);
+	 
+	 isFADD    <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00000));
+	 isFSUB    <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00001));
+	 isFMUL    <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00010));
+	 isFDIV    <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00011));
+	 isFSQRT   <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b01011));
+	 
+	 isFSGNJ   <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b00));
+	 isFSGNJN  <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b01));      
+	 isFSGNJX  <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00100) && (mem_rdata[13:12] == 2'b10));   
+	 
+	 isFMIN    <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) && !mem_rdata[12]);
+	 isFMAX    <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b00101) &&  mem_rdata[12]);      
+	 
+	 isFEQ     <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b10));
+	 isFLT     <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b01));
+	 isFLE     <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b10100) && (mem_rdata[13:12] == 2'b00));                        
+	 
+	 isFCLASS  <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) &&  mem_rdata[12]); 
+   
+	 isFCVTWS  <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) && !mem_rdata[20]);
+	 isFCVTWUS <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11000) &&  mem_rdata[20]);
+
+	 isFCVTSW  <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) && !mem_rdata[20]);
+	 isFCVTSWU <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11010) &&  mem_rdata[20]);
+	 
+	 isFMVXW   <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11100) && !mem_rdata[12]);
+	 isFMVWX   <= (mem_rdata[4] && (mem_rdata[31:27] == 5'b11110));
+
+	 rdIsFP <= (mem_rdata[6:2] == 5'b00001)             || // FLW
+	           (mem_rdata[6:4] == 3'b100  )             || // F{N}MADD,F{N}MSUB
+	           (mem_rdata[6:4] == 3'b101 && (
+                              (mem_rdata[31]    == 1'b0)    || // R-Type FPU
+		  	      (mem_rdata[31:28] == 4'b1101) || // FCVT.S.W{U}
+			      (mem_rdata[31:28] == 4'b1111)    // FMV.W.X 
+		   )
+               );
+      end
+   end   
+
+   // Result register of the (simulated) FPU. In this section it is only
+   // assigned inside the `ifdef VERILATOR part of the always block below.
+   reg [31:0] fpuOut;
+`define FPU_OUT fpuOut
+   // Constant 0: the simulated FPU never reports busy to the state machine.
+   wire       fpuBusy = 0;
+   
+   // Register-file access. The register file is indexed with 6 bits:
+   // {isFP, 5-bit register id}. The three branches form an if / else-if
+   // chain, so at most one of them runs per clock, in this priority order:
+   //   1. operand fetch while in WAIT_INSTR,
+   //   2. simulated FPU evaluation in EXECUTE2 for FPU instructions,
+   //   3. register write-back.
+   always @(posedge clk) begin
+      if(state[WAIT_INSTR_bit]) begin
+	 // Fetch registers as soon as instruction is ready.
+	 // Note: the fetch is repeated on every clock spent in WAIT_INSTR
+	 // (it is not gated by mem_rbusy here).
+	 rs1 <= registerFile[{rs1IsFP,mem_rdata[19:15]}]; 
+	 rs2 <= registerFile[{rs2IsFP,mem_rdata[24:20]}];
+	 // rs3 is always read with the upper index bit set, using
+	 // bits [31:27] of the instruction word as register id.
+	 rs3 <= registerFile[{1'b1, mem_rdata[31:27]}];
+      end else if(state[EXECUTE2_bit] & isFPU) begin
+`ifdef VERILATOR
+	 // Simulation only: each FPU operation is evaluated by a C function
+	 // called through Verilator's $c32. Without VERILATOR this branch
+	 // is empty and fpuOut is left unchanged.
+	 (* parallel_case *)
+	 case(1'b1)
+	   isFMADD  : `FPU_OUT <= $c32("FMADD(",rs1,",",rs2,",",rs3,")");
+	   isFMSUB  : `FPU_OUT <= $c32("FMSUB(",rs1,",",rs2,",",rs3,")");
+	   isFNMSUB : `FPU_OUT <= $c32("FNMSUB(",rs1,",",rs2,",",rs3,")");
+	   isFNMADD : `FPU_OUT <= $c32("FNMADD(",rs1,",",rs2,",",rs3,")");
+  
+	   isFMUL   : `FPU_OUT <= $c32("FMUL(",rs1,",",rs2,")");
+	   isFADD   : `FPU_OUT <= $c32("FADD(",rs1,",",rs2,")");
+	   isFSUB   : `FPU_OUT <= $c32("FSUB(",rs1,",",rs2,")");
+	   
+	   isFDIV   : `FPU_OUT <= $c32("FDIV(",rs1,",",rs2,")");
+	   isFSQRT  : `FPU_OUT <= $c32("FSQRT(",rs1,")");
+
+	   
+	   isFSGNJ  : `FPU_OUT <= $c32("FSGNJ(",rs1,",",rs2,")");
+	   isFSGNJN : `FPU_OUT <= $c32("FSGNJN(",rs1,",",rs2,")");
+	   isFSGNJX : `FPU_OUT <= $c32("FSGNJX(",rs1,",",rs2,")");
+	   
+	   isFMIN   : `FPU_OUT <= $c32("FMIN(",rs1,",",rs2,")");
+	   isFMAX   : `FPU_OUT <= $c32("FMAX(",rs1,",",rs2,")");
+	   
+	   isFEQ    : `FPU_OUT <= $c32("FEQ(",rs1,",",rs2,")");
+	   isFLE    : `FPU_OUT <= $c32("FLE(",rs1,",",rs2,")");
+	   isFLT    : `FPU_OUT <= $c32("FLT(",rs1,",",rs2,")");
+	   
+	   isFCLASS : `FPU_OUT <= $c32("FCLASS(",rs1,")") ;
+	   
+	   isFCVTWS : `FPU_OUT <= $c32("FCVTWS(",rs1,")");
+	   isFCVTWUS: `FPU_OUT <= $c32("FCVTWUS(",rs1,")");
+	   
+	   isFCVTSW : `FPU_OUT <= $c32("FCVTSW(",rs1,")");
+	   isFCVTSWU: `FPU_OUT <= $c32("FCVTSWU(",rs1,")");
+	   
+	   // Register moves: the 32-bit pattern is copied unchanged.
+           isFMVXW:   `FPU_OUT <= rs1;
+	   isFMVWX:   `FPU_OUT <= rs1;	   
+	 endcase 
+`endif
+	 
+      // register write-back
+      // Being the last branch of the chain, write-back is skipped during
+      // EXECUTE2 for FPU instructions; their result is written while the
+      // state machine sits in WAIT_ALU_OR_MEM.
+      // rdIsNZ is defined outside this section (presumably "rd != x0";
+      // TODO confirm).
+      end else if( 
+	      !(isBranch | isStore) & (rdIsFP | rdIsNZ) & 
+	      (state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]) 
+      ) begin 
+	 registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
+      end
+   end
+
+   
+`ifdef VERILATOR
+   // When doing simulations, compare the result of all operations with
+   // what's computed on the host CPU. 
+
+   // Sink for the value returned by the CHECK_* calls below.
+   reg [31:0] z;
+   // Operand snapshots taken in EXECUTE2. In this section they are only
+   // referenced by the commented-out FDIV / FSQRT checks.
+   reg [31:0] rs1_bkp;
+   reg [31:0] rs2_bkp;
+   reg [31:0] rs3_bkp;   
+
+   always @(posedge clk) begin
+      // Some micro-coded instructions (FDIV/FSQRT) use rs1, rs2 and
+      // rs3 as temporary registers, so we need to save them to be able
+      // to recompute the operation on the host CPU.
+      if(isFPU && state[EXECUTE2_bit]) begin
+	 rs1_bkp <= rs1;
+	 rs2_bkp <= rs2;
+	 rs3_bkp <= rs3;
+      end
+      
+      // The check runs on every clock spent in WAIT_ALU_OR_MEM while an
+      // FPU instruction is latched. Instructions without an entry below
+      // (FSGNJ*, FCLASS, FMV*, FDIV, FSQRT) are not checked.
+      if(
+	 isFPU && state[WAIT_ALU_OR_MEM_bit] // && fpmi_PC == 0
+      ) begin
+	 case(1'b1)
+	   isFMUL: z <= $c32("CHECK_FMUL(",fpuOut,",",rs1,",",rs2,")");
+	   isFADD: z <= $c32("CHECK_FADD(",fpuOut,",",rs1,",",rs2,")");
+	   isFSUB: z <= $c32("CHECK_FSUB(",fpuOut,",",rs1,",",rs2,")");
+	   
+	   // my FDIV and FSQRT are not IEEE754 compliant ! 
+	   // (checks commented-out for now)
+	   // Note: checks use rs1_bkp and rs2_bkp because
+	   //  FDIV and FSQRT overwrite rs1 and rs2
+	   //
+           //isFDIV:  
+	   // z<=$c32("CHECK_FDIV(",fpuOut,",",rs1_bkp,",",rs2_bkp,")");
+           //isFSQRT: 
+	   // z<=$c32("CHECK_FSQRT(",fpuOut,",",rs1_bkp,")");
+
+	   
+	   isFMADD :
+	   z<=$c32("CHECK_FMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
+	   
+	   isFMSUB :
+	   z<=$c32("CHECK_FMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
+	   
+	   isFNMSUB:
+	   z<=$c32("CHECK_FNMSUB(",fpuOut,",",rs1,",",rs2,",",rs3,")");
+	   
+	   isFNMADD:
+	   z<=$c32("CHECK_FNMADD(",fpuOut,",",rs1,",",rs2,",",rs3,")");
+
+	   isFEQ: z <= $c32("CHECK_FEQ(",fpuOut,",",rs1,",",rs2,")");
+	   isFLT: z <= $c32("CHECK_FLT(",fpuOut,",",rs1,",",rs2,")");
+	   isFLE: z <= $c32("CHECK_FLE(",fpuOut,",",rs1,",",rs2,")");
+
+	   isFCVTWS : z <= $c32("CHECK_FCVTWS(",fpuOut,",",rs1,")");
+	   isFCVTWUS: z <= $c32("CHECK_FCVTWUS(",fpuOut,",",rs1,")");
+
+	   isFCVTSW : z <= $c32("CHECK_FCVTSW(",fpuOut,",",rs1,")");
+	   isFCVTSWU: z <= $c32("CHECK_FCVTSWU(",fpuOut,",",rs1,")");
+
+	   isFMIN: z <= $c32("CHECK_FMIN(",fpuOut,",",rs1,",",rs2,")");
+	   isFMAX: z <= $c32("CHECK_FMAX(",fpuOut,",",rs1,",",rs2,")");
+	   
+	 endcase
+      end
+   end 
+   
+`endif
+   
+   
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except DIV
+   /***************************************************************************/
+
+   // Operand A of the ALU is always the latched rs1 value.
+   wire [31:0] aluIn1 = rs1;
+
+   // Operand B of the ALU:
+   //    rs2  for register-register operations and branches,
+   //    Iimm for everything else (ALUimm, Load, JALR).
+   wire [31:0] aluIn2 = (isALUreg | isBranch) ? rs2 : Iimm;
+
+   // ALU write strobe (driven further down, next to the state machine).
+   wire aluWr;
+
+   // Shared adder: arithmetic instructions and the JALR target.
+   wire [31:0] aluPlus = aluIn2 + aluIn1;
+
+   // One 33-bit subtraction (A - B computed as A + ~B + 1) yields the
+   // difference and all three comparison flags
+   // (trick borrowed from swapforth/J1).
+   wire [32:0] aluMinus = {1'b0, aluIn1} + {1'b1, ~aluIn2} + 33'd1;
+   wire        LTU = aluMinus[32];
+   wire        LT  = (aluIn1[31] != aluIn2[31]) ? aluIn1[31] : LTU;
+   wire        EQ  = (aluMinus[31:0] == 32'b0);
+
+   /***************************************************************************/
+
+   // Use the same shifter both for left and right shifts by 
+   // applying bit reversal
+   // (flip32 is defined outside this section; presumably it reverses the
+   //  bit order of a 32-bit word - TODO confirm).
+
+   // Input is passed through flip32 when funct3 == 1.
+   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
+   
+   // 33-bit arithmetic right shift: the extra MSB is the sign bit of rs1
+   // when instr[30] is set and 0 otherwise. The 33-bit result is truncated
+   // to 32 bits, hence the WIDTH lint waiver.
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter = 
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   // Shifter output passed through flip32 again; selected when funct3 == 1.
+   wire [31:0] leftshift = flip32(shifter);
+   
+   /***************************************************************************/
+
+   // 32x32 multiplier shared by MUL, MULH, MULHSU and MULHU.
+   // funct3: 1->MULH, 2->MULHSU  3->MULHU
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   // Operand signedness, per the RISC-V "M" extension:
+   //   MULH   : signed   rs1 * signed   rs2
+   //   MULHSU : signed   rs1 * unsigned rs2
+   //   MULHU  : unsigned rs1 * unsigned rs2
+   // aluIn1 carries rs1 and aluIn2 carries rs2 for these instructions, so
+   // rs1 is sign-extended for MULH and MULHSU, rs2 only for MULH.
+   // (The previous code had the two roles swapped for MULHSU.)
+   wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
+   wire sign2 = aluIn2[31] &  isMULH;
+
+   // Operands are widened to 33 bits with the selected sign bit so that a
+   // single signed multiplier covers all the variants.
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+   wire signed [63:0] multiply = signed1 * signed2;
+
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   // Base-ISA ALU result: each funct3 value gates one candidate, and the
+   // gated candidates are OR-ed together.
+   wire [31:0] alu_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   // funct3: 0->MUL 1->MULH 2->MULHSU 3->MULHU
+   //         4->DIV 5->DIVU 6->REM    7->REMU
+   
+   // Low word of the product for MUL, high word for the MULH* variants.
+   wire [31:0] alu_mul = funct3Is[0] 
+                               ? multiply[31: 0]   // 0:MUL
+                               : multiply[63:32] ; // 1:MULH, 2:MULHSU, 3:MULHU
+
+   // instr[13] selects the remainder (left in the dividend register by the
+   // divider) instead of the quotient; div_sign requests negation.
+   wire [31:0] alu_div = instr[13] ? (div_sign ? -dividend : dividend) 
+    	                           : (div_sign ? -quotient : quotient);
+   
+
+   wire        aluBusy = |quotient_msk; // ALU is busy if division in progress.
+   reg [31:0]  aluOut;
+
+   // Raw instruction bits: these are only meaningful together with
+   // isALUreg, which is tested at each place where they are used.
+   wire funcM     = instr[25];
+   wire isDivide  = instr[14];
+   
+   // aluOut is a register updated on every clock (no enable): it always
+   // holds the result selected during the previous cycle.
+   always @(posedge clk) begin
+      aluOut <=  (isALUreg & funcM) ? (isDivide ? alu_div : alu_mul) : alu_base;
+   end
+
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+   //
+   // Sequential shift-and-subtract divider working on absolute values:
+   //   dividend     : running remainder
+   //   divisor      : |rs2| pre-shifted left by 31, shifted right each step
+   //   quotient     : built one bit per step, MSB first
+   //   quotient_msk : a single 1 loaded in bit 32 and shifted right once
+   //                  per clock; aluBusy (|quotient_msk) stays high until
+   //                  that bit has been shifted out.
+   //   div_sign     : set when the final result must be negated
+
+   reg div_sign;
+
+   reg [31:0] dividend;
+   reg [62:0] divisor;
+   reg [31:0] quotient;
+   reg [32:0] quotient_msk;
+
+   always @(posedge clk) begin
+      if (aluWr) begin
+	 // Operands are loaded on every ALU instruction; only the busy
+	 // marker below is specific to divisions.
+	 // instr[12] is set for the unsigned variants (DIVU/REMU), for
+	 // which the operands are used as they are.
+	 dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+	 divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+	 quotient <= 0;
+	 // Only bit 32 is written here; bits 31:0 keep their value.
+	 quotient_msk[32] <= isALUreg & funcM & isDivide;
+	 // Remainder (instr[13]) takes the sign of rs1; the quotient is
+	 // negative when the operand signs differ and rs2 is not zero.
+	 div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] : 
+                      (aluIn1[31] ^ aluIn2[31]) & |aluIn2);
+      end else begin
+	 // One division step per clock. This branch runs on every clock
+	 // without aluWr, whether or not a division is in progress.
+	 divisor      <= divisor >> 1;
+	 quotient_msk <= quotient_msk >> 1;
+	 if(divisor <= {31'b0, dividend}) begin
+	    quotient <= {quotient[30:0],1'b1};
+	    dividend <= dividend - divisor[31:0];
+	 end else begin
+	    quotient <= {quotient[30:0],1'b0};
+	 end
+      end
+   end
+   
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   // Combinational branch condition: one candidate per branch kind, each
+   // gated by its funct3 value, reduced with OR.
+   wire predicate_ = |{
+        funct3Is[7] & ~LTU,  // BGEU
+        funct3Is[6] &  LTU,  // BLTU
+        funct3Is[5] & ~LT,   // BGE
+        funct3Is[4] &  LT,   // BLT
+        funct3Is[1] & ~EQ,   // BNE
+        funct3Is[0] &  EQ    // BEQ
+   };
+
+   // Registered branch condition (loaded from predicate_ by the state
+   // machine).
+   reg predicate;
+   
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // Registered output of the adder that computes branch address, JAL
+   // address and AUIPC (loaded by the state machine in EXECUTE1).
+   reg [ADDR_WIDTH-1:0]  PCplusImm;
+
+   // Registered output of a separate adder that computes the address of
+   // load/store (also loaded in EXECUTE1).
+   reg [ADDR_WIDTH-1:0]  loadstore_addr;
+   
+   // Address bus: PC while fetching (FETCH_INSTR / WAIT_INSTR), load/store
+   // address in all other states. ADDR_PAD is defined outside this section
+   // (presumably the zero padding up to 32 bits - TODO confirm).
+   assign mem_addr = {ADDR_PAD, 
+		       state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ? 
+		       PC : loadstore_addr
+		     };
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   // Each source is gated by its instruction-class selector and the gated
+   // values are OR-ed together, so the result is only meaningful when at
+   // most one selector is active (assumes the is* decode signals are
+   // mutually exclusive - they are defined outside this section).
+   // Every SYSTEM instruction reads the cycle counter here.
+   wire [31:0] writeBackData  =
+      /* verilator lint_off WIDTH */	       	       
+      (isSYSTEM            ? cycles               : 32'b0) |  // SYSTEM
+      /* verilator lint_on WIDTH */	       	       	       
+      (isLUI               ? Uimm                 : 32'b0) |  // LUI
+      (isALU               ? aluOut               : 32'b0) |  // ALUreg, ALUimm
+      (isFPU               ? fpuOut               : 32'b0) |  // FPU	       
+      (isAUIPC             ? {ADDR_PAD,PCplusImm} : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? {ADDR_PAD,PCplus4  } : 32'b0) |  // JAL, JALR
+      (isLoad              ? LOAD_data            : 32'b0);   // Load
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // Memory is always accessed as aligned 32-bit words. Byte and halfword
+   // accesses are extracted from / placed into the word using:
+   // - funct3[1:0] (= instr[13:12]): 00->byte 01->halfword 10->word
+   // - loadstore_addr[1:0]: which byte/halfword of the word is accessed
+   // - instr[2], set for FLW and FSW, which forces a word access.
+   wire mem_byteAccess     = ~instr[2] & (instr[13:12] == 2'b00);
+   wire mem_halfwordAccess = ~instr[2] & (instr[13:12] == 2'b01);
+
+   // LOAD: narrow the word read from memory step by step.
+
+   // Halfword selected by address bit 1.
+   wire [15:0] LOAD_halfword =
+               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   // Byte selected inside that halfword by address bit 0.
+   wire  [7:0] LOAD_byte =
+               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // Bit replicated into the upper bits of a narrow load:
+   // funct3[2] (instr[14]) = 0 -> sign extension, 1 -> zero extension.
+   wire LOAD_sign =
+        ~instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   // Value delivered to the register file.
+   wire [31:0] LOAD_data =
+        mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
+        mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                             mem_rdata;
+
+   // STORE
+   // The data to store is replicated onto the byte lanes selected by the
+   // low address bits: rs2[7:0] can appear on any lane and rs2[15:8] on
+   // lane 3. The write mask (STORE_wmask) decides which lanes are
+   // actually written.
+
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  : 
+			     loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+
+   // Byte-lane write mask:
+   //    word     : 1111
+   //    halfword : 0011 shifted left by 0 or 2 lanes (loadstore_addr[1])
+   //    byte     : 0001 shifted left by 0..3 lanes   (loadstore_addr[1:0])
+
+   wire [3:0] STORE_wmask =
+        mem_byteAccess     ? (4'b0001 <<  loadstore_addr[1:0])      :
+        mem_halfwordAccess ? (4'b0011 << {loadstore_addr[1], 1'b0}) :
+                             4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   // Bit position of each state inside the state register.
+   localparam FETCH_INSTR_bit     = 0;
+   localparam WAIT_INSTR_bit      = 1;
+   localparam EXECUTE1_bit        = 2;
+   localparam EXECUTE2_bit        = 3;   
+   localparam WAIT_ALU_OR_MEM_bit = 4;
+   localparam NB_STATES           = 5;
+
+   // State values (a single bit set in each).
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
+   localparam EXECUTE1        = 1 << EXECUTE1_bit;
+   localparam EXECUTE2        = 1 << EXECUTE2_bit;   
+   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+   
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // The memory-read signal.
+   // Asserted for a load in EXECUTE2 and for every instruction fetch.
+   assign mem_rstrb = state[EXECUTE2_bit] & isLoad | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   // Non-zero only during EXECUTE2 of a store.
+   assign mem_wmask = {4{state[EXECUTE2_bit] & isStore}} & STORE_wmask;
+
+   // aluWr is asserted in EXECUTE1 of ALU instructions; it loads the
+   // divider operands (the shifter is combinatorial in this core).
+   assign aluWr = state[EXECUTE1_bit] & isALU;
+
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+
+   // needToWait is sampled in EXECUTE2 and sends the state machine to
+   // WAIT_ALU_OR_MEM instead of FETCH_INSTR. When NRV_IS_IO_ADDR is
+   // defined, stores only wait if that macro is true for their address.
+`ifdef NRV_IS_IO_ADDR  
+   wire needToWait = isLoad | 
+		     isStore  & `NRV_IS_IO_ADDR(mem_addr) | 
+		     aluBusy | isFPU;
+`else
+   wire needToWait = isLoad | isStore | aluBusy | isFPU;   
+`endif
+
+   // Main state machine:
+   //   FETCH_INSTR -> WAIT_INSTR -> EXECUTE1 -> EXECUTE2
+   //               -> (WAIT_ALU_OR_MEM ->) FETCH_INSTR
+   // Reset is active low and leaves the machine in WAIT_ALU_OR_MEM.
+   always @(posedge clk) begin
+      if(!reset) begin
+         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE1;        // also the declaration of instr).
+           end
+        end
+
+        // First execute cycle: register the address computations and the
+        // branch condition so that EXECUTE2 only has to select among them.
+        state[EXECUTE1_bit]: begin
+	   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+	   // Equivalent to:
+	   //  PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+	   PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] : 
+			       instr[4] ? Uimm[ADDR_WIDTH-1:0] : 
+			                  Bimm[ADDR_WIDTH-1:0] );
+
+	   // testing instr[5] is equivalent to testing isStore in this context.
+	   loadstore_addr <= rs1[ADDR_WIDTH-1:0] + 
+ 		     (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+	   
+	   predicate <= predicate_;
+	   state <= EXECUTE2;
+	end
+	
+        // Second execute cycle: update PC (bit 0 of a JALR target is
+        // cleared) and decide whether the instruction needs to wait.
+        state[EXECUTE2_bit]: begin
+           PC <= isJALR          ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+                 jumpToPCplusImm ? PCplusImm :
+                 PCplus4;
+	   state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+        end
+
+        // Stay here until the divider, the FPU and the memory are all idle.
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
+	      state <= FETCH_INSTR;
+	   end
+        end
+
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+	
+      endcase
+   end
+
+   /***************************************************************************/
+   // Cycle counter
+   /***************************************************************************/
+
+   // Free-running counter incremented on every clock. Its width is
+   // NRV_COUNTER_WIDTH when that macro is defined, 32 bits otherwise.
+   // No reset or initial value is visible in this section.
+`ifdef NRV_COUNTER_WIDTH
+   reg [`NRV_COUNTER_WIDTH-1:0]  cycles;   
+`else   
+   reg [31:0]  cycles;
+`endif   
+   always @(posedge clk) cycles <= cycles + 1;
+
+endmodule
+
+/*****************************************************************************/
@@ -0,0 +1,452 @@
+/*******************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+//
+// This version: The "electron", with RV32IM support.
+//             A single VERILOG file, compact & understandable code.
+//
+// Instruction set: RV32IM 
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/*******************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32im"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-O3"
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+   input         reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+   /***************************************************************************/
+   // Instruction decoding.
+   /***************************************************************************/
+
+   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
+   // Reference: Table page 104 of:
+   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+   // The destination register
+   wire [4:0] rdId = instr[11:7];
+
+   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+   // It is used as follows: funct3Is[val] <=> funct3 == val
+   (* onehot *)
+   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
+   // All of them are sign-extended from instr[31], except Uimm which
+   // occupies the upper 20 bits and has its 12 low bits cleared.
+   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
+   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
+   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
+   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
+   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
+   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
+   /* verilator lint_on UNUSED */
+
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   // Each class is recognized from instr[6:2]; the two low opcode bits are
+   // not stored in instr.
+   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
+
+   // Any register-immediate or register-register ALU operation
+   // (M-extension operations are ALUreg instructions with instr[25] set).
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   // 32 x 32-bit register file and the two operand registers that are
+   // loaded from it by the state machine.
+   reg [31:0] registerFile [31:0];
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+
+   // Synchronous write port. Register 0 is never written.
+   always @(posedge clk)
+     if (writeBack && (rdId != 0)) registerFile[rdId] <= writeBackData;
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except division.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   wire aluWr;               // ALU write strobe, starts dividing.
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   // aluMinus = aluIn1 + ~aluIn2 + 1 computed on 33 bits:
+   //  - bits [31:0] are the difference aluIn1 - aluIn2,
+   //  - bit 32 is the unsigned "less than" flag,
+   //  - signed "less than" uses the sign of aluIn1 when the operand signs
+   //    differ and bit 32 otherwise.
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   wire        LTU = aluMinus[32];
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   /***************************************************************************/
+
+   // A single right shifter serves both directions: for a left shift
+   // (funct3 == 1) the operand is bit-reversed before the shifter and the
+   // result is bit-reversed again after it.
+
+   // Returns its argument with the bit order reversed.
+   function [31:0] flip32;
+      input [31:0] x;
+      flip32 = {x[ 0], x[ 1], x[ 2], x[ 3], x[ 4], x[ 5], x[ 6], x[ 7],
+                x[ 8], x[ 9], x[10], x[11], x[12], x[13], x[14], x[15],
+                x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23],
+                x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
+   endfunction
+
+   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
+
+   // 33-bit arithmetic right shift; the extra MSB is the sign of rs1 when
+   // instr[30] is set and 0 otherwise. The result is truncated to 32 bits.
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter = 
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   wire [31:0] leftshift = flip32(shifter);
+
+   /***************************************************************************/
+
+   // M-extension decoding.
+   // funcM is instr[25]; it is only meaningful together with isALUreg.
+   wire funcM     = instr[25];
+   wire isDivide  = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
+   wire aluBusy   = |quotient_msk; // ALU is busy if division is in progress.
+
+   // funct3: 1->MULH, 2->MULHSU  3->MULHU
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   // Operand signedness, per the RISC-V "M" extension:
+   //   MULH   : signed   rs1 * signed   rs2
+   //   MULHSU : signed   rs1 * unsigned rs2
+   //   MULHU  : unsigned rs1 * unsigned rs2
+   // aluIn1 carries rs1 and aluIn2 carries rs2 for these instructions, so
+   // rs1 is sign-extended for MULH and MULHSU, rs2 only for MULH.
+   // (The previous code had the two roles swapped for MULHSU.)
+   wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
+   wire sign2 = aluIn2[31] &  isMULH;
+
+   // Operands are widened to 33 bits with the selected sign bit so that a
+   // single signed multiplier covers all the variants.
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+   wire signed [63:0] multiply = signed1 * signed2;
+
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   // Base-ISA result: funct3Is is one-hot, so exactly one of the OR-ed
+   // terms below is non-masked.
+   wire [31:0] aluOut_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   // M-extension result: low or high word of the product, or the divider
+   // result (divResult, negated when div_sign is set) when instr[14] is set.
+   wire [31:0] aluOut_muldiv =
+     (  funct3Is[0]   ?  multiply[31: 0] : 32'b0) | // 0:MUL
+     ( |funct3Is[3:1] ?  multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
+     (  instr[14]     ?  div_sign ? -divResult : divResult : 32'b0) ; 
+                                                 // 4:DIV, 5:DIVU, 6:REM, 7:REMU
+   
+   // Final ALU output (combinatorial in this core).
+   wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
+
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+   //
+   // Sequential shift-and-subtract divider working on absolute values:
+   //   dividend     : running remainder
+   //   divisor      : |rs2| pre-shifted left by 31, shifted right each step
+   //   quotient     : receives one bit per step, at the position given by
+   //                  quotient_msk
+   //   quotient_msk : one-hot position of the current quotient bit; it is
+   //                  zero when no division is in progress (see aluBusy)
+
+   reg [31:0] dividend;
+   reg [62:0] divisor;
+   reg [31:0] quotient;
+   reg [31:0] quotient_msk;
+
+   // True when the shifted divisor fits into the running remainder.
+   wire divstep_do = divisor <= {31'b0, dividend};
+
+   // Next values of remainder and quotient for the current step.
+   wire [31:0] dividendN     = divstep_do ? dividend - divisor[31:0] : dividend;
+   wire [31:0] quotientN     = divstep_do ? quotient | quotient_msk  : quotient;
+
+   // Result must be negated: never for the unsigned variants (instr[12]);
+   // for the remainder (instr[13]) when rs1 is negative; for the quotient
+   // when the operand signs differ and rs2 is not zero.
+   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] : 
+                    (aluIn1[31] != aluIn2[31]) & |aluIn2);
+
+   always @(posedge clk) begin
+      if (isDivide & aluWr) begin
+	 // Start of a division: load absolute values (operands are used
+	 // as they are for the unsigned variants).
+	 dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+	 divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+	 quotient <= 0;
+	 quotient_msk <= 1 << 31;
+      end else begin
+	 // One division step per clock. This branch also runs when no
+	 // division is in progress.
+	 dividend     <= dividendN;
+	 divisor      <= divisor >> 1;
+	 quotient     <= quotientN;
+	 quotient_msk <= quotient_msk >> 1;
+      end
+   end
+      
+   // Registered copy of the value selected by instr[13]: remainder when
+   // set, quotient otherwise. Updated on every clock.
+   reg  [31:0] divResult;
+   always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   // Branch condition: one candidate per branch kind, each gated by its
+   // funct3 value, reduced with OR.
+   wire predicate = |{
+        funct3Is[7] & ~LTU,  // BGEU
+        funct3Is[6] &  LTU,  // BLTU
+        funct3Is[5] & ~LT,   // BGE
+        funct3Is[4] &  LT,   // BLT
+        funct3Is[1] & ~EQ,   // BNE
+        funct3Is[0] &  EQ    // BEQ
+   };
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // An adder used to compute branch address, JAL address and AUIPC.
+   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+   // (among these three opcodes, instr[3] is set only for JAL and
+   //  instr[4] only for AUIPC).
+   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                                                       Bimm[ADDR_WIDTH-1:0] );
+   // A separate adder to compute the destination of load/store.
+   // testing instr[5] is equivalent to testing isStore in this context.
+   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
+                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   /* verilator lint_off WIDTH */
+   // internal address registers and cycles counter may have less than 
+   // 32 bits, so we deactivate width test for mem_addr and writeBackData
+
+   // Address bus: PC while fetching (FETCH_INSTR / WAIT_INSTR), load/store
+   // address in all other states.
+   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
+                     PC : loadstore_addr;
+
+   /***************************************************************************/
+   // Counter.
+   /***************************************************************************/
+
+   // Free-running 64-bit counter. It has no reset; it is only given an
+   // initial value by the BENCH initial block of this module.
+   reg  [63:0]           cycles;  // Cycle counter
+   always @(posedge clk) cycles <= cycles + 1;
+
+   // CSR address 0xC80 selects the upper word of the counter. Every other
+   // CSR address reads the lower word.
+   wire sel_cyclesh = (instr[31:20] == 12'hC80);
+   wire [31:0] CSR_read = sel_cyclesh ? cycles[63:32] : cycles[31:0];
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   // Each source is gated by its instruction-class selector and the gated
+   // values are OR-ed together. The selectors compare instr[6:2] with
+   // distinct constants, so at most one term is non-zero.
+   wire [31:0] writeBackData  =
+      (isSYSTEM            ? CSR_read  : 32'b0) |  // SYSTEM
+      (isLUI               ? Uimm      : 32'b0) |  // LUI
+      (isALU               ? aluOut    : 32'b0) |  // ALUreg, ALUimm
+      (isAUIPC             ? PCplusImm : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? PCplus4   : 32'b0) |  // JAL, JALR
+      (isLoad              ? LOAD_data : 32'b0) ;  // Load
+
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // Memory is always accessed as aligned 32-bit words. Byte and halfword
+   // accesses are extracted from / placed into the word using:
+   // - funct3[1:0] (= instr[13:12]): 00->byte 01->halfword 10->word
+   // - loadstore_addr[1:0]: which byte/halfword of the word is accessed
+
+   wire mem_byteAccess     = (instr[13:12] == 2'b00);
+   wire mem_halfwordAccess = (instr[13:12] == 2'b01);
+
+   // LOAD: narrow the word read from memory step by step.
+
+   // Halfword selected by address bit 1.
+   wire [15:0] LOAD_halfword =
+               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   // Byte selected inside that halfword by address bit 0.
+   wire  [7:0] LOAD_byte =
+               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // Bit replicated into the upper bits of a narrow load:
+   // funct3[2] (instr[14]) = 0 -> sign extension, 1 -> zero extension.
+   wire LOAD_sign =
+        ~instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   // Value delivered to the register file.
+   wire [31:0] LOAD_data =
+        mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
+        mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                             mem_rdata;
+
+   // STORE
+   // The data to store is replicated onto the byte lanes selected by the
+   // low address bits: rs2[7:0] can appear on any lane and rs2[15:8] on
+   // lane 3. The write mask below decides which lanes are actually written.
+
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
+                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+
+   // The memory write mask:
+   //    1111                     if writing a word
+   //    0011 or 1100             if writing a halfword
+   //                                (depending on loadstore_addr[1])
+   //    0001, 0010, 0100 or 1000 if writing a byte
+   //                                (depending on loadstore_addr[1:0])
+   // This is the mask for the access size only; it is gated with the
+   // store strobe where mem_wmask is assigned.
+
+   wire [3:0] STORE_wmask =
+              mem_byteAccess      ?
+                    (loadstore_addr[1] ?
+                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
+                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
+                    ) :
+              mem_halfwordAccess ?
+                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+              4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   // Bit position of each state inside the state register.
+   localparam FETCH_INSTR_bit     = 0;
+   localparam WAIT_INSTR_bit      = 1;
+   localparam EXECUTE_bit         = 2;
+   localparam WAIT_ALU_OR_MEM_bit = 3;
+   localparam NB_STATES           = 4;
+
+   // State values (a single bit set in each).
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
+   localparam EXECUTE         = 1 << EXECUTE_bit;
+   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // register write-back enable.
+   // Active in EXECUTE and during the whole WAIT_ALU_OR_MEM state, so
+   // the destination register may be written several times; the last
+   // write is the one that remains.
+   wire writeBack = ~(isBranch | isStore ) &
+                    (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
+
+   // The memory-read signal.
+   // Asserted for a load in EXECUTE and for every instruction fetch.
+   assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   // Non-zero only during EXECUTE of a store.
+   assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
+
+   // aluWr is asserted in EXECUTE of ALU instructions; together with
+   // isDivide it starts the divider (the shifter is combinatorial).
+   assign aluWr = state[EXECUTE_bit] & isALU;
+
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+
+   // Instructions that go through WAIT_ALU_OR_MEM after EXECUTE.
+   wire needToWait = isLoad | isStore | isDivide;
+
+   // Next PC; bit 0 of a JALR target is cleared.
+   wire [ADDR_WIDTH-1:0] PC_new = 
+			 isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+                         jumpToPCplusImm  ? PCplusImm :
+                         PCplus4;
+
+   // Main state machine:
+   //   FETCH_INSTR -> WAIT_INSTR -> EXECUTE -> (WAIT_ALU_OR_MEM ->) FETCH_INSTR
+   // Reset is active low and leaves the machine in WAIT_ALU_OR_MEM.
+   always @(posedge clk) begin
+      if(!reset) begin
+         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        // Latch the instruction and read both source registers as soon as
+        // the memory delivers the instruction word.
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              rs1 <= registerFile[mem_rdata[19:15]];
+              rs2 <= registerFile[mem_rdata[24:20]];
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE;         // also the declaration of instr).
+           end
+        end
+
+        // Single execute cycle: update PC and decide whether to wait.
+        state[EXECUTE_bit]: begin
+           PC <= PC_new;
+           state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+        end
+
+        // Stay here until the divider and the memory are both idle.
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+        end
+
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+
+      endcase
+   end
+
+   // Simulation only (BENCH): start the cycle counter at 0 and give
+   // register 0 a defined value (the write port never writes register 0).
+`ifdef BENCH
+   initial begin
+      cycles = 0;
+      registerFile[0] = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements,
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed.
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this:
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a both more compact (fewer LUTs) and faster state machine.
+
@@ -0,0 +1,674 @@
+/******************************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+//
+// This version: The "Gracilis", with full interrupt and
+//               RVC compressed instructions support.
+//             A single VERILOG file, compact & understandable code.
+//
+// Instruction set: RV32IMC + CSR + MRET
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/******************************************************************************/
+
+// Firmware generation flags for this processor
+// NOTE(review): the file header advertises RV32IMC, yet NRV_ARCH below asks
+// for "rv32imac" (i.e. including the A extension). The opcode decoder in this
+// file has no entry for atomic instructions. Presumably the string was picked
+// to match an available toolchain multilib -- confirm before changing it, and
+// do not rely on atomic instructions in firmware built for this core.
+`define NRV_ARCH     "rv32imac"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-O3"
+// Presumably tells the firmware/SoC side that this core has interrupt
+// support (the module has an interrupt_request input) -- verify against users.
+`define NRV_INTERRUPTS
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         interrupt_request,
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+   /***************************************************************************/
+   // Instruction decoding.
+   /***************************************************************************/
+
+   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
+   // Reference: Table page 104 of:
+   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+   // The destination register
+   wire [4:0] rdId = instr[11:7];
+
+   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+   // It is used as follows: funct3Is[val] <=> funct3 == val
+   (* onehot *)
+   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
+   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
+   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
+   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
+   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
+   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
+   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
+   /* verilator lint_on UNUSED */
+
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
+
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+   reg [31:0] registerFile [31:0];
+
+   // Register file write port: on each rising clock edge, commit
+   // writeBackData into register rdId when write-back is enabled.
+   // Register 0 is never written.
+   always @(posedge clk) begin
+      if (writeBack && (rdId != 0)) begin
+         registerFile[rdId] <= writeBackData;
+      end
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except divisions.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   wire aluWr;               // ALU write strobe, starts dividing.
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   wire        LTU = aluMinus[32];
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   /***************************************************************************/
+
+   // Use the same shifter both for left and right shifts by 
+   // applying bit reversal
+
+   wire [31:0] shifter_in = funct3Is[1] ?
+     {aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5], 
+      aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11], 
+      aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17], 
+      aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
+      aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29], 
+      aluIn1[30], aluIn1[31]} : aluIn1;
+
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter = 
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   wire [31:0] leftshift = {
+     shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4], 
+     shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9], 
+     shifter[10], shifter[11], shifter[12], shifter[13], shifter[14], 
+     shifter[15], shifter[16], shifter[17], shifter[18], shifter[19], 
+     shifter[20], shifter[21], shifter[22], shifter[23], shifter[24], 
+     shifter[25], shifter[26], shifter[27], shifter[28], shifter[29], 
+     shifter[30], shifter[31]};
+
+   /***************************************************************************/
+
+   wire funcM     = instr[25];
+   wire isDivide = isALUreg & funcM & instr[14];
+   wire aluBusy   = |quotient_msk; // ALU is busy if division is in progress.
+
+   // funct3: 1->MULH, 2->MULHSU  3->MULHU
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   wire sign1 = aluIn1[31] &  isMULH;
+   wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
+
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+   wire signed [63:0] multiply = signed1 * signed2;
+
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   wire [31:0] aluOut_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   wire [31:0] aluOut_muldiv =
+     (  funct3Is[0]   ?  multiply[31: 0] : 32'b0) | // 0:MUL
+     ( |funct3Is[3:1] ?  multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
+     (  instr[14]     ?  div_sign ? -divResult : divResult : 32'b0) ; 
+                                                 // 4:DIV, 5:DIVU, 6:REM, 7:REMU
+
+   wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
+
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+
+   reg [31:0] dividend;
+   reg [62:0] divisor;
+   reg [31:0] quotient;
+   reg [31:0] quotient_msk;
+
+   wire divstep_do = (divisor <= {31'b0, dividend});
+
+   wire [31:0] dividendN     = divstep_do ? dividend - divisor[31:0] : dividend;
+   wire [31:0] quotientN     = divstep_do ? quotient | quotient_msk  : quotient;
+
+   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] : 
+                                          (aluIn1[31] != aluIn2[31]) & |aluIn2);
+
+   // Sequential shift-and-subtract divider: one quotient bit is resolved per
+   // clock cycle. A division is (re)started in the cycle where
+   // isDivide & aluWr is high; in every other cycle the registers simply keep
+   // stepping. aluBusy is the OR of quotient_msk, so it stays high until the
+   // single mask bit has been shifted out.
+   always @(posedge clk) begin
+      if (isDivide & aluWr) begin
+         // instr[12] == 0 selects the signed variants (DIV/REM): in that case
+         // negative operands are negated so the loop works on magnitudes.
+         dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+         // The divisor starts shifted 31 bits to the left of the dividend
+         // and moves one bit to the right on every following cycle.
+         divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+         quotient <= 0;
+         // Single-bit mask marking the quotient bit currently tried (MSB first).
+         quotient_msk <= 1 << 31;
+      end else begin
+         // dividendN / quotientN already contain the conditional subtract and
+         // the conditional quotient-bit set (see divstep_do).
+         dividend     <= dividendN;
+         divisor      <= divisor >> 1;
+         quotient     <= quotientN;
+         quotient_msk <= quotient_msk >> 1;
+      end
+   end 
+   
+   // Registered divider output, refreshed every cycle from the next-state
+   // values: the remainder (dividendN) when instr[13] is set, else the
+   // quotient (quotientN). The sign is applied later, in aluOut_muldiv.
+   reg  [31:0] divResult;
+   always @(posedge clk) begin
+      divResult <= instr[13] ? dividendN : quotientN;
+   end
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   wire predicate =
+        funct3Is[0] &  EQ  | // BEQ
+        funct3Is[1] & !EQ  | // BNE
+        funct3Is[4] &  LT  | // BLT
+        funct3Is[5] & !LT  | // BGE
+        funct3Is[6] &  LTU | // BLTU
+        funct3Is[7] & !LTU ; // BGEU
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+   wire [ADDR_WIDTH-1:0] PCinc   = long_instr ? PCplus4 : PCplus2;
+
+   // An adder used to compute branch address, JAL address and AUIPC.
+   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                                                       Bimm[ADDR_WIDTH-1:0] );
+
+   // A separate adder to compute the destination of load/store.
+   // testing instr[5] is equivalent to testing isStore in this context.
+   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
+                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   /* verilator lint_off WIDTH */
+   assign mem_addr =   state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
+                       fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
+                                         : {PC     [ADDR_WIDTH-1:2], 2'b00}
+                       : loadstore_addr  ;
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // Interrupt logic, CSR registers and opcodes.
+   /***************************************************************************/
+
+   // Remember interrupt requests as they are not checked for every cycle
+   reg  interrupt_request_sticky;
+   
+   // Interrupt enable and lock logic
+   wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
+
+   // Processor accepts interrupts in EXECUTE state.   
+   wire interrupt_accepted = interrupt & state[EXECUTE_bit];        
+
+   // If current interrupt is accepted, there already might be the next one,
+   //  which should not be missed:
+   always @(posedge clk) begin
+     interrupt_request_sticky <= 
+         interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
+   end
+
+   // Decoder for mret opcode
+   wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
+
+   // CSRs:
+   reg  [ADDR_WIDTH-1:0] mepc;    // The saved program counter.
+   reg  [ADDR_WIDTH-1:0] mtvec;   // The address of the interrupt handler.
+   reg                   mstatus; // Interrupt enable
+   reg                   mcause;  // Interrupt cause (and lock)
+   reg  [63:0]           cycles;  // Cycle counter
+
+   always @(posedge clk) cycles <= cycles + 1;
+
+   wire sel_mstatus = (instr[31:20] == 12'h300);
+   wire sel_mtvec   = (instr[31:20] == 12'h305);
+   wire sel_mepc    = (instr[31:20] == 12'h341);
+   wire sel_mcause  = (instr[31:20] == 12'h342);
+   wire sel_cycles  = (instr[31:20] == 12'hC00);
+   wire sel_cyclesh = (instr[31:20] == 12'hC80);
+
+   // Read CSRs
+   /* verilator lint_off WIDTH */
+   wire [31:0] CSR_read =
+     (sel_mstatus ? {28'b0, mstatus, 3'b0} : 32'b0) |
+     (sel_mtvec   ? mtvec                  : 32'b0) |
+     (sel_mepc    ? mepc                   : 32'b0) |
+     (sel_mcause  ? {mcause, 31'b0}        : 32'b0) |
+     (sel_cycles  ? cycles[31:0]           : 32'b0) |
+     (sel_cyclesh ? cycles[63:32]          : 32'b0) ;
+   /* verilator lint_on WIDTH */
+
+   // Write CSRs: 5 bit unsigned immediate or content of RS1
+   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1; 
+
+   wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read  :
+                           (instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
+                        /* (instr[13:12] == 2'b01) ? */  CSR_modifier ;
+
+   // CSR write port.
+   // - Active-low reset clears mstatus (interrupts disabled); mtvec is left
+   //   untouched by reset.
+   // - A SYSTEM instruction with a non-zero funct3 (a CSR access) updates the
+   //   selected CSR while the core is in the EXECUTE state. Only mstatus
+   //   (bit 3 of the written value) and mtvec are written here.
+   always @(posedge clk) begin
+      if (!reset) begin
+         mstatus <= 0;
+      end else if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
+         if (sel_mstatus) begin
+            mstatus <= CSR_write[3];
+         end
+         if (sel_mtvec) begin
+            mtvec <= CSR_write[ADDR_WIDTH-1:0];
+         end
+      end
+   end
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   /* verilator lint_off WIDTH */
+   wire [31:0] writeBackData  =
+      (isSYSTEM            ? CSR_read  : 32'b0) |  // SYSTEM
+      (isLUI               ? Uimm      : 32'b0) |  // LUI
+      (isALU               ? aluOut    : 32'b0) |  // ALUreg, ALUimm
+      (isAUIPC             ? PCplusImm : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? PCinc     : 32'b0) |  // JAL, JALR
+      (isLoad              ? LOAD_data : 32'b0);   // Load
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
+   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
+
+   // LOAD, in addition to funct3[1:0], LOAD depends on:
+   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion
+
+   wire LOAD_sign =
+        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   wire [31:0] LOAD_data =
+         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
+     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                          mem_rdata ;
+
+   wire [15:0] LOAD_halfword =
+               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // STORE
+
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
+                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+
+   // The memory write mask:
+   //    1111                     if writing a word
+   //    0011 or 1100             if writing a halfword
+   //                                (depending on loadstore_addr[1])
+   //    0001, 0010, 0100 or 1000 if writing a byte
+   //                                (depending on loadstore_addr[1:0])
+
+   wire [3:0] STORE_wmask =
+              mem_byteAccess      ?
+                    (loadstore_addr[1] ?
+                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
+                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
+                    ) :
+              mem_halfwordAccess ?
+                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+              4'b1111;
+
+   /***************************************************************************/
+   // Unaligned fetch mechanism and compressed opcode handling
+   /***************************************************************************/
+
+   // One-word fetch cache: word address (PC bits above the two LSBs) and
+   // contents of the most recently cached instruction word. It is filled by
+   // the state machine in WAIT_INSTR and set to an all-ones address on reset.
+   reg [ADDR_WIDTH-1:2] cached_addr;
+   reg           [31:0] cached_data;
+
+   // Does the cached word cover the current PC / the PC of the next
+   // instruction (PC_new)?
+   wire current_cache_hit = cached_addr == PC     [ADDR_WIDTH-1:2];
+   wire    next_cache_hit = cached_addr == PC_new [ADDR_WIDTH-1:2];
+
+   // True when the instruction starts in the upper halfword of the word
+   // (PC[1] set) and that halfword's two low bits (word bits 17:16) are both 1,
+   // i.e. it is a 32-bit opcode whose second half lives in the following word.
+   wire current_unaligned_long = &cached_mem [17:16] & PC    [1];
+   wire    next_unaligned_long = &cached_data[17:16] & PC_new[1];
+
+   // fetch_second_half: when set, mem_addr points at the word after PC
+   //                    (see the mem_addr assignment) to get the second half.
+   // long_instr:        latched in WAIT_INSTR from the two low bits of
+   //                    decomp_input; selects PC+4 rather than PC+2 in PCinc.
+   reg fetch_second_half;
+   reg long_instr;
+
+   // Word holding the start of the current instruction: the cached copy on a
+   // hit, otherwise whatever memory is returning right now.
+   wire [31:0] cached_mem   = current_cache_hit ? cached_data : mem_rdata;
+   // Decompressor input. For a halfword-offset PC, the low half is the upper
+   // halfword of cached_mem and the high half is the low halfword of
+   // mem_rdata; otherwise the word is used as is.
+   wire [31:0] decomp_input = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]} 
+                                    : cached_mem;
+   wire [31:0] decompressed;
+
+   // Expands 16-bit compressed opcodes to 32 bits; passes long opcodes through.
+   decompressor _decomp ( .c(decomp_input), .d(decompressed) );
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   localparam FETCH_INSTR_bit          = 0;
+   localparam WAIT_INSTR_bit           = 1;
+   localparam EXECUTE_bit              = 2;
+   localparam WAIT_ALU_OR_MEM_bit      = 3;
+   localparam WAIT_ALU_OR_MEM_SKIP_bit = 4;
+
+   localparam NB_STATES                = 5;
+
+   localparam FETCH_INSTR          = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR           = 1 << WAIT_INSTR_bit;
+   localparam EXECUTE              = 1 << EXECUTE_bit;
+   localparam WAIT_ALU_OR_MEM      = 1 << WAIT_ALU_OR_MEM_bit;
+   localparam WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // register write-back enable.
+   wire writeBack = ~(isBranch | isStore ) & (
+            state[EXECUTE_bit] | 
+	    state[WAIT_ALU_OR_MEM_bit] | 
+            state[WAIT_ALU_OR_MEM_SKIP_bit]
+   );
+
+   // The memory-read signal.
+   assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
+
+   // aluWr starts computation (divide) in the ALU.
+   assign aluWr = state[EXECUTE_bit] & isALU;
+
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+
+   wire needToWait = isLoad | isStore | isDivide;
+
+   wire [ADDR_WIDTH-1:0] PC_new = 
+           isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+           jumpToPCplusImm  ? PCplusImm :
+           interrupt_return ? mepc :
+                              PCinc;
+
+   // Main sequencer (1-hot state, see note [2] at the end of this file).
+   //
+   //  FETCH_INSTR          : read strobe is issued (mem_rstrb), then WAIT_INSTR.
+   //  WAIT_INSTR           : when memory is ready, refresh the fetch cache,
+   //                         latch rs1/rs2/instr, and either go to EXECUTE or
+   //                         go back to FETCH_INSTR for the second half of an
+   //                         unaligned 32-bit opcode.
+   //  EXECUTE              : update PC (or take an interrupt) and pick the
+   //                         next state.
+   //  WAIT_ALU_OR_MEM      : wait for divider / memory, then FETCH_INSTR.
+   //  WAIT_ALU_OR_MEM_SKIP : same, but the next instruction is already in the
+   //                         fetch cache, so continue directly in WAIT_INSTR.
+   //
+   // reset is active low.
+   always @(posedge clk) begin
+      if(!reset) begin
+         state             <= WAIT_ALU_OR_MEM;     //Just waiting for !mem_wbusy
+         PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
+         mcause            <= 0;
+         cached_addr       <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
+         fetch_second_half <= 0;
+      end else begin
+
+         // See note [1] at the end of this file.
+         (* parallel_case *)
+         case(1'b1)
+
+           state[WAIT_INSTR_bit]: begin
+              if(!mem_rbusy) begin // may be high when executing from SPI flash
+                 // Update cache: on a miss, or when the word just read is
+                 // the second half of an unaligned long opcode.
+                 // (No ';' after this 'end': a null statement inside a
+                 // begin/end block is not valid in strict Verilog.)
+                 if (~current_cache_hit | fetch_second_half) begin
+                    cached_addr <= mem_addr[ADDR_WIDTH-1:2];
+                    cached_data <= mem_rdata;
+                 end
+
+                 // Decode instruction (after expansion of compressed opcodes)
+                 rs1 <= registerFile[decompressed[19:15]];
+                 rs2 <= registerFile[decompressed[24:20]];
+                 instr      <= decompressed[31:2];
+                 long_instr <= &decomp_input[1:0];
+
+                 // Long opcode, unaligned, first part fetched,
+                 // happens in non-linear code
+                 if (current_unaligned_long & ~fetch_second_half) begin
+                    fetch_second_half <= 1;
+                    state <= FETCH_INSTR;
+                 end else begin
+                    fetch_second_half <= 0;
+                    state <= EXECUTE;
+                 end
+              end
+           end
+
+           state[EXECUTE_bit]: begin
+              if (interrupt) begin
+                 // Take the interrupt: jump to the handler, remember where
+                 // to resume, and lock further interrupts through mcause.
+                 PC     <= mtvec;
+                 mepc   <= PC_new;
+                 mcause <= 1;
+                 state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+              end else begin
+                 PC <= PC_new;
+                 // Returning from the handler unlocks interrupts again.
+                 if (interrupt_return) mcause <= 0;
+
+                 // If the next instruction is entirely in the fetch cache,
+                 // skip FETCH_INSTR.
+                 state <= next_cache_hit & ~next_unaligned_long
+                        ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
+                        : (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);
+
+                 // First half is cached, only the second half must be read.
+                 fetch_second_half <= next_cache_hit & next_unaligned_long;
+              end
+           end
+
+           state[WAIT_ALU_OR_MEM_bit]: begin
+              if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+           end
+
+           state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
+              if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= WAIT_INSTR;
+           end
+
+           default: begin // FETCH_INSTR
+              state <= WAIT_INSTR;
+           end
+         endcase
+      end
+   end
+
+`ifdef BENCH
+   initial begin
+      cycles = 0;
+      registerFile[0] = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+
+// If c[15:0] is a compressed instruction, decompresses it into d;
+// otherwise copies c to d unchanged.
+// Purely combinatorial expander for 16-bit compressed opcodes.
+//   c : candidate instruction word; only c[15:0] is inspected for decoding.
+//   d : resulting 32-bit instruction (c itself when c[1:0] == 2'b11).
+module decompressor(
+   input  wire [31:0] c,
+   output reg  [31:0] d
+);
+
+   // How to handle illegal and unknown opcodes
+   // (both currently produce the all-zero word)
+
+   localparam illegal = 32'h00000000;
+   localparam unknown = 32'h00000000;
+
+   // Register decoder
+   // The 3-bit compressed register fields are prefixed with 2'b01, which
+   // maps them onto registers 8..15.
+
+   wire [4:0] rcl = {2'b01, c[4:2]}; // Register compressed low
+   wire [4:0] rch = {2'b01, c[9:7]}; // Register compressed high
+
+   wire [4:0] rwl  = c[ 6:2];  // Register wide low
+   wire [4:0] rwh  = c[11:7];  // Register wide high
+
+   // Fixed register numbers used by the expansions below.
+   localparam x0 = 5'b00000;
+   localparam x1 = 5'b00001;
+   localparam x2 = 5'b00010;   
+
+   // Immediate decoder
+   // Each wire reassembles the scattered immediate bits of one compressed
+   // format into the bit order of the 32-bit immediate it is expanded to.
+
+   wire  [4:0]    shiftImm = c[6:2];
+
+   wire [11:0] addi4spnImm = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
+   wire [11:0]     lwswImm = {5'b00000, c[5], c[12:10] , c[6], 2'b00};
+   wire [11:0]     lwspImm = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
+   wire [11:0]     swspImm = {4'b0000, c[8:7], c[12:9], 2'b00};
+
+   wire [11:0] addi16spImm = {{ 3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
+   wire [11:0]      addImm = {{ 7{c[12]}}, c[6:2]};
+
+   /* verilator lint_off UNUSED */
+   wire [12:0]        bImm = {{ 5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
+   wire [20:0]      jalImm = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
+   wire [31:0]      luiImm = {{15{c[12]}}, c[6:2], 12'b000000000000};
+   /* verilator lint_on UNUSED */
+
+   // Decode table. The first matching item wins, so where two patterns
+   // overlap the more specific one is listed first (c.illegal before
+   // c.addi4spn, c.addi16sp before c.lui, c.jr before c.mv, c.jalr before
+   // c.add). Underscores in the patterns are only visual separators.
+   always @*
+   casez (c[15:0])
+                                                     // imm / funct7   +   rs2  rs1     fn3                   rd    opcode
+      16'b???___????????_???_11 : d =                                                                            c  ; // Long opcode, no need to decompress
+
+/* verilator lint_off CASEOVERLAP */
+     
+      16'b000___00000000_000_00 : d =                                                                       illegal ; // c.illegal   -->  illegal
+      16'b000___????????_???_00 : d = {      addi4spnImm,             x2, 3'b000,                 rcl, 7'b00100_11} ; // c.addi4spn  -->  addi rd', x2, nzuimm[9:2]
+/* verilator lint_on CASEOVERLAP */
+     
+      16'b010_???_???_??_???_00 : d = {          lwswImm,            rch, 3'b010,                 rcl, 7'b00000_11} ; // c.lw        -->  lw   rd', offset[6:2](rs1')
+      16'b110_???_???_??_???_00 : d = {    lwswImm[11:5],       rcl, rch, 3'b010,        lwswImm[4:0], 7'b01000_11} ; // c.sw        -->  sw   rs2', offset[6:2](rs1')
+
+      16'b000_???_???_??_???_01 : d = {           addImm,            rwh, 3'b000,                 rwh, 7'b00100_11} ; // c.addi      -->  addi rd, rd, nzimm[5:0]
+      16'b001____???????????_01 : d = {     jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12],   x1, 7'b11011_11} ; // c.jal       -->  jal  x1, offset[11:1]
+      16'b010__?_?????_?????_01 : d = {           addImm,             x0, 3'b000,                 rwh, 7'b00100_11} ; // c.li        -->  addi rd, x0, imm[5:0]
+      16'b011__?_00010_?????_01 : d = {      addi16spImm,            rwh, 3'b000,                 rwh, 7'b00100_11} ; // c.addi16sp  -->  addi x2, x2, nzimm[9:4]
+      16'b011__?_?????_?????_01 : d = {    luiImm[31:12],                                         rwh, 7'b01101_11} ; // c.lui       -->  lui  rd, nzuimm[17:12]
+      16'b100_?_00_???_?????_01 : d = {       7'b0000000,  shiftImm, rch, 3'b101,                 rch, 7'b00100_11} ; // c.srli      -->  srli rd', rd', shamt[5:0]
+      16'b100_?_01_???_?????_01 : d = {       7'b0100000,  shiftImm, rch, 3'b101,                 rch, 7'b00100_11} ; // c.srai      -->  srai rd', rd', shamt[5:0]
+      16'b100_?_10_???_?????_01 : d = {           addImm,            rch, 3'b111,                 rch, 7'b00100_11} ; // c.andi      -->  andi rd', rd', imm[5:0]
+      16'b100_011_???_00_???_01 : d = {       7'b0100000,       rcl, rch, 3'b000,                 rch, 7'b01100_11} ; // c.sub       -->  sub  rd', rd', rs2'
+      16'b100_011_???_01_???_01 : d = {       7'b0000000,       rcl, rch, 3'b100,                 rch, 7'b01100_11} ; // c.xor       -->  xor  rd', rd', rs2'
+      16'b100_011_???_10_???_01 : d = {       7'b0000000,       rcl, rch, 3'b110,                 rch, 7'b01100_11} ; // c.or        -->  or   rd', rd', rs2'
+      16'b100_011_???_11_???_01 : d = {       7'b0000000,       rcl, rch, 3'b111,                 rch, 7'b01100_11} ; // c.and       -->  and  rd', rd', rs2'
+      16'b101____???????????_01 : d = {     jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12],   x0, 7'b11011_11} ; // c.j         -->  jal  x0, offset[11:1]
+      16'b110__???_???_?????_01 : d = {bImm[12], bImm[10:5],     x0, rch, 3'b000, bImm[4:1], bImm[11], 7'b11000_11} ; // c.beqz      -->  beq  rs1', x0, offset[8:1]
+      16'b111__???_???_?????_01 : d = {bImm[12], bImm[10:5],     x0, rch, 3'b001, bImm[4:1], bImm[11], 7'b11000_11} ; // c.bnez      -->  bne  rs1', x0, offset[8:1]
+
+      16'b000__?_?????_?????_10 : d = {        7'b0000000, shiftImm, rwh, 3'b001,                 rwh, 7'b00100_11} ; // c.slli      -->  slli rd, rd, shamt[5:0]
+      16'b010__?_?????_?????_10 : d = {           lwspImm,            x2, 3'b010,                 rwh, 7'b00000_11} ; // c.lwsp      -->  lw   rd, offset[7:2](x2)
+      16'b100__0_?????_00000_10 : d = {  12'b000000000000,           rwh, 3'b000,                  x0, 7'b11001_11} ; // c.jr        -->  jalr x0, rs1, 0
+      16'b100__0_?????_?????_10 : d = {        7'b0000000,      rwl,  x0, 3'b000,                 rwh, 7'b01100_11} ; // c.mv        -->  add  rd, x0, rs2
+   // c.ebreak is disabled: with the line below commented out, its encoding
+   // falls into the c.jalr pattern that follows.
+   // 16'b100__1_00000_00000_10 : d = {                              25'b00000000_00010000_00000000_0, 7'b11100_11} ; // c.ebreak    -->  ebreak
+      16'b100__1_?????_00000_10 : d = {  12'b000000000000,           rwh, 3'b000,                  x1, 7'b11001_11} ; // c.jalr      -->  jalr x1, rs1, 0
+      16'b100__1_?????_?????_10 : d = {        7'b0000000,      rwl, rwh, 3'b000,                 rwh, 7'b01100_11} ; // c.add       -->  add  rd, rd, rs2
+      16'b110__?_?????_?????_10 : d = {     swspImm[11:5],      rwl,  x2, 3'b010,        swspImm[4:0], 7'b01000_11} ; // c.swsp      -->  sw   rs2, offset[7:2](x2)
+
+      default:                    d =                                                                       unknown ; // Unknown opcode
+   endcase
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements,
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed.
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this:
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a state machine that is both more compact (fewer LUTs) and faster.
@@ -0,0 +1,730 @@
+/******************************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+//
+// This version: The "Individua", with full interrupt, atomic and
+//               RVC compressed instructions support.
+//             A single VERILOG file, compact & understandable code.
+//
+// Instruction set: RV32IMAC + CSR + MRET
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/******************************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32imac"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-O3"
+`define NRV_INTERRUPTS
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         interrupt_request,
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+   /***************************************************************************/
+   // Instruction decoding.
+   /***************************************************************************/
+
+   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
+   // Reference: Table page 104 of:
+   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+   // The destination register
+   wire [4:0] rdId = instr[11:7];
+
+   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+   // It is used as follows: funct3Is[val] <=> funct3 == val
+   (* onehot *)
+   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
+   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
+   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
+   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
+   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
+   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
+   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
+   /* verilator lint_on UNUSED */
+
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+   wire isAMO     =  (instr[6:2] == 5'b01011); // various
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
+
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+   reg [31:0] registerFile [31:0];
+
+   always @(posedge clk) begin
+     if (writeBack)
+       if (rdId != 0)
+         registerFile[rdId] <= writeBackData;
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except divisions.
+   /***************************************************************************/
+
+   // First ALU source: rs1, or the word on mem_rdata for AMO opcodes
+   wire [31:0] aluIn1 = isAMO ? mem_rdata : rs1;
+
+   // Second ALU source, depends on opcode:
+   //    AMO, ALUreg, Branch: rs2
+   //    all other opcodes (ALUimm, Load, JALR, ...): Iimm
+   wire [31:0] aluIn2 = isAMO | isALUreg | isBranch ? rs2 : Iimm;
+
+   wire aluWr;               // ALU write strobe, starts dividing.
+
+   // The adder is used by arithmetic instructions, JALR and amoadd.w.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33-bit subtract (aluIn1 + ~aluIn2 + 1) for SUB and all
+   // comparisons (trick from swapforth/J1); bit 32 is set iff aluIn1 < aluIn2 unsigned
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32]; // signed <
+   wire        LTU = aluMinus[32];                                          // unsigned <
+   wire        EQ  = (aluMinus[31:0] == 0);                                 // ==
+
+   /***************************************************************************/
+
+   // Use the same shifter both for left and right shifts by
+   // applying bit reversal
+
+   wire [31:0] shifter_in = funct3Is[1] ?
+     {aluIn1[ 0], aluIn1[ 1], aluIn1[ 2], aluIn1[ 3], aluIn1[ 4], aluIn1[ 5],
+      aluIn1[ 6], aluIn1[ 7], aluIn1[ 8], aluIn1[ 9], aluIn1[10], aluIn1[11],
+      aluIn1[12], aluIn1[13], aluIn1[14], aluIn1[15], aluIn1[16], aluIn1[17],
+      aluIn1[18], aluIn1[19], aluIn1[20], aluIn1[21], aluIn1[22], aluIn1[23],
+      aluIn1[24], aluIn1[25], aluIn1[26], aluIn1[27], aluIn1[28], aluIn1[29],
+      aluIn1[30], aluIn1[31]} : aluIn1;
+
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter =
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   wire [31:0] leftshift = {
+     shifter[ 0], shifter[ 1], shifter[ 2], shifter[ 3], shifter[ 4],
+     shifter[ 5], shifter[ 6], shifter[ 7], shifter[ 8], shifter[ 9],
+     shifter[10], shifter[11], shifter[12], shifter[13], shifter[14],
+     shifter[15], shifter[16], shifter[17], shifter[18], shifter[19],
+     shifter[20], shifter[21], shifter[22], shifter[23], shifter[24],
+     shifter[25], shifter[26], shifter[27], shifter[28], shifter[29],
+     shifter[30], shifter[31]};
+
+   /***************************************************************************/
+
+   wire funcM     = instr[25];
+   wire isDivide = isALUreg & funcM & instr[14];
+   wire aluBusy   = |quotient_msk; // ALU is busy if division is in progress.
+
+   // funct3: 1->MULH, 2->MULHSU  3->MULHU
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   wire sign1 = aluIn1[31] &  isMULH;
+   wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
+
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+   wire signed [63:0] multiply = signed1 * signed2;
+
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   wire [31:0] aluOut_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   wire [31:0] aluOut_muldiv =
+     (  funct3Is[0]   ?  multiply[31: 0] : 32'b0) | // 0:MUL
+     ( |funct3Is[3:1] ?  multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
+     (  instr[14]     ?  div_sign ? -divResult : divResult : 32'b0) ;
+                                                 // 4:DIV, 5:DIVU, 6:REM, 7:REMU
+
+   wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
+
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+
+   reg [31:0] dividend;
+   reg [62:0] divisor;
+   reg [31:0] quotient;
+   reg [31:0] quotient_msk;
+
+   wire divstep_do = (divisor <= {31'b0, dividend});
+
+   wire [31:0] dividendN     = divstep_do ? dividend - divisor[31:0] : dividend;
+   wire [31:0] quotientN     = divstep_do ? quotient | quotient_msk  : quotient;
+
+   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] :
+                                          (aluIn1[31] != aluIn2[31]) & |aluIn2);
+
+   always @(posedge clk) begin
+      if (isDivide & aluWr) begin
+         dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+         divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+         quotient <= 0;
+         quotient_msk <= 1 << 31;
+      end else begin
+         dividend     <= dividendN;
+         divisor      <= divisor >> 1;
+         quotient     <= quotientN;
+         quotient_msk <= quotient_msk >> 1;
+      end
+   end
+
+   reg  [31:0] divResult;
+   always @(posedge clk) begin
+      divResult <= instr[13] ? dividendN : quotientN;
+   end
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   wire predicate =
+        funct3Is[0] &  EQ  | // BEQ
+        funct3Is[1] & !EQ  | // BNE
+        funct3Is[4] &  LT  | // BLT
+        funct3Is[5] & !LT  | // BGE
+        funct3Is[6] &  LTU | // BLTU
+        funct3Is[7] & !LTU ; // BGEU
+
+   /***************************************************************************/
+   // Special ALU for atomic opcodes
+   /***************************************************************************/
+
+   wire [31:0] amoALU =
+     // Selected by instr[31:27]; for AMO opcodes aluIn1 = mem_rdata and aluIn2 = rs2.
+     (instr[31:27] == 5'h00 ?         aluPlus          : 32'b0) | // amoadd.w
+     (instr[31:27] == 5'h01 ?                  aluIn2  : 32'b0) | // amoswap.w
+     (instr[31:27] == 5'h04 ?         aluIn1 ^ aluIn2  : 32'b0) | // amoxor.w
+     (instr[31:27] == 5'h08 ?         aluIn1 | aluIn2  : 32'b0) | // amoor.w
+     (instr[31:27] == 5'h0C ?         aluIn1 & aluIn2  : 32'b0) | // amoand.w
+     (instr[31:27] == 5'h10 ? ( LT  ? aluIn1 : aluIn2) : 32'b0) | // amomin.w
+     (instr[31:27] == 5'h14 ? (!LT  ? aluIn1 : aluIn2) : 32'b0) | // amomax.w
+     (instr[31:27] == 5'h18 ? ( LTU ? aluIn1 : aluIn2) : 32'b0) | // amominu.w
+     (instr[31:27] == 5'h1C ? (!LTU ? aluIn1 : aluIn2) : 32'b0) ; // amomaxu.w
+
+   reg [31:0] amo_wdata; // amoALU result, latched in WAIT_ALU_OR_MEM
+
+   wire amo_write = state[WRITE_AMO_bit] | state[WAIT_AMO_bit]; // puts amo_wdata on mem_wdata
+
+   wire isAMOlr = instr[31:27] == 5'h02; // lr.w
+   wire isAMOsc = instr[31:27] == 5'h03; // sc.w
+
+   reg [ADDR_WIDTH-1:0] amo_location;           // address latched from rs1 by lr.w
+   reg                  amo_location_unchanged; // set by lr.w; cleared by any other AMO or a store to amo_location
+
+   wire reserved_addr = mem_addr[ADDR_WIDTH-1:0] == amo_location; // current access hits the reserved address
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+   wire [ADDR_WIDTH-1:0] PCinc   = long_instr ? PCplus4 : PCplus2;
+
+   // An adder used to compute branch address, JAL address and AUIPC.
+   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                                                       Bimm[ADDR_WIDTH-1:0] );
+
+   // A separate adder to compute the destination of load/store.
+   // testing instr[5] is equivalent to testing isStore in this context.
+   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
+                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   /* verilator lint_off WIDTH */
+   assign mem_addr =   state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
+                       fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
+                                         : {PC     [ADDR_WIDTH-1:2], 2'b00}
+                       : isAMO ? rs1[ADDR_WIDTH-1:0] : loadstore_addr;
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // Interrupt logic, CSR registers and opcodes.
+   /***************************************************************************/
+
+   // Remember interrupt requests, as they are not checked on every cycle
+   reg  interrupt_request_sticky; // NOTE(review): has no reset or initial value
+
+   // Interrupt enable (mstatus) and lock (mcause, set while a handler runs) logic
+   wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
+
+   // Processor accepts interrupts in EXECUTE state.
+   wire interrupt_accepted = interrupt & state[EXECUTE_bit];
+
+   // Keep the request pending until it is accepted. A request arriving in the
+   // same cycle as the acceptance is kept, so the next one is not missed:
+   always @(posedge clk) begin
+     interrupt_request_sticky <=
+         interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
+   end
+   // Decoder for mret opcode. Only the SYSTEM opcode and funct3 == 0 are tested:
+   // the instr[31:20] check is commented out, so any such encoding acts as mret.
+   wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
+
+   // CSRs:
+   reg  [ADDR_WIDTH-1:0] mepc;    // The saved program counter.
+   reg  [ADDR_WIDTH-1:0] mtvec;   // The address of the interrupt handler.
+   reg                   mstatus; // Interrupt enable
+   reg                   mcause;  // Interrupt cause (and lock)
+   reg  [63:0]           cycles;  // Cycle counter
+
+   always @(posedge clk) cycles <= cycles + 1;
+
+   wire sel_mstatus = (instr[31:20] == 12'h300);
+   wire sel_mtvec   = (instr[31:20] == 12'h305);
+   wire sel_mepc    = (instr[31:20] == 12'h341);
+   wire sel_mcause  = (instr[31:20] == 12'h342);
+   wire sel_cycles  = (instr[31:20] == 12'hC00);
+   wire sel_cyclesh = (instr[31:20] == 12'hC80);
+
+   // Read CSRs
+   /* verilator lint_off WIDTH */
+   wire [31:0] CSR_read =
+     (sel_mstatus ? {28'b0, mstatus, 3'b0} : 32'b0) |
+     (sel_mtvec   ? mtvec                  : 32'b0) |
+     (sel_mepc    ? mepc                   : 32'b0) |
+     (sel_mcause  ? {mcause, 31'b0}        : 32'b0) |
+     (sel_cycles  ? cycles[31:0]           : 32'b0) |
+     (sel_cyclesh ? cycles[63:32]          : 32'b0) ;
+   /* verilator lint_on WIDTH */
+
+   // Write CSRs: 5 bit unsigned immediate or content of RS1
+   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;
+
+   wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read  :
+                           (instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
+                        /* (instr[13:12] == 2'b01) ? */  CSR_modifier ;
+
+   always @(posedge clk) begin
+      if(!reset) begin
+	 mstatus <= 0;
+      end else begin
+	 // Execute a CSR opcode
+	 if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
+	    if (sel_mstatus) mstatus <= CSR_write[3];
+	    if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
+	 end
+      end
+   end
+
+   /***************************************************************************/
+   // The value written back to the register file (OR of per-opcode terms).
+   /***************************************************************************/
+   // NOTE(review): EXECUTE clears amo_location_unchanged for sc.w but writeBack stays high in WAIT_ALU_OR_MEM, so rd may always end up 1 - confirm.
+   /* verilator lint_off WIDTH */
+   wire [31:0] writeBackData  =
+      (isSYSTEM            ? CSR_read  : 32'b0) |  // SYSTEM
+      (isLUI               ? Uimm      : 32'b0) |  // LUI
+      (isALU               ? aluOut    : 32'b0) |  // ALUreg, ALUimm
+      (isAUIPC             ? PCplusImm : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? PCinc     : 32'b0) |  // JAL, JALR: address of the next instruction
+      (isLoad | isAMO & ~isAMOsc ? LOAD_data      : 32'b0) |  // Load, AMO (incl. lr.w): value read from memory
+      (isAMO & isAMOsc ? {31'b0, ~amo_location_unchanged} : 32'b0); // sc.w: 0 if amo_location_unchanged is set, else 1
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
+   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
+
+   // LOAD, in addition to funct3[1:0], LOAD depends on:
+   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion
+
+   wire LOAD_sign =
+        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   wire [31:0] LOAD_data =
+         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
+     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                          mem_rdata ;
+
+   wire [15:0] LOAD_halfword =
+               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // STORE
+
+   assign mem_wdata[ 7: 0] = amo_write ? amo_wdata[ 7: 0] : rs2[7:0];
+   assign mem_wdata[15: 8] = amo_write ? amo_wdata[15: 8] : loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[23:16] = amo_write ? amo_wdata[23:16] : loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[31:24] = amo_write ? amo_wdata[31:24] : loadstore_addr[0] ? rs2[7:0]  :
+                                                            loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+
+   // The memory write mask:
+   //    1111                     if writing a word
+   //    0011 or 1100             if writing a halfword
+   //                                (depending on loadstore_addr[1])
+   //    0001, 0010, 0100 or 1000 if writing a byte
+   //                                (depending on loadstore_addr[1:0])
+
+   wire [3:0] STORE_wmask =
+              mem_byteAccess      ?
+                    (loadstore_addr[1] ?
+                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
+                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
+                    ) :
+              mem_halfwordAccess ?
+                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+              4'b1111;
+
+   /***************************************************************************/
+   // Unaligned fetch mechanism and compressed opcode handling
+   /***************************************************************************/
+
+   reg [ADDR_WIDTH-1:2] cached_addr;
+   reg           [31:0] cached_data;
+
+   wire current_cache_hit = cached_addr == PC     [ADDR_WIDTH-1:2];
+   wire    next_cache_hit = cached_addr == PC_new [ADDR_WIDTH-1:2];
+
+   wire current_unaligned_long = &cached_mem [17:16] & PC    [1];
+   wire    next_unaligned_long = &cached_data[17:16] & PC_new[1];
+
+   reg fetch_second_half;
+   reg long_instr;
+
+   wire [31:0] cached_mem   = current_cache_hit ? cached_data : mem_rdata;
+   wire [31:0] decomp_input = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
+                                    : cached_mem;
+   wire [31:0] decompressed;
+
+   decompressor _decomp ( .c(decomp_input), .d(decompressed) );
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   localparam FETCH_INSTR_bit          = 0;
+   localparam WAIT_INSTR_bit           = 1;
+   localparam EXECUTE_bit              = 2;
+   localparam WAIT_ALU_OR_MEM_bit      = 3;
+   localparam WRITE_AMO_bit            = 4;
+   localparam WAIT_AMO_bit             = 5;
+
+   localparam NB_STATES                = 6;
+
+   localparam FETCH_INSTR          = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR           = 1 << WAIT_INSTR_bit;
+   localparam EXECUTE              = 1 << EXECUTE_bit;
+   localparam WAIT_ALU_OR_MEM      = 1 << WAIT_ALU_OR_MEM_bit;
+   localparam WRITE_AMO            = 1 << WRITE_AMO_bit;
+   localparam WAIT_AMO             = 1 << WAIT_AMO_bit;
+
+   reg SkipFetch; // Skip fetch state later
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // register write-back enable.
+   wire writeBack = ~(isBranch | isStore ) & (
+            state[EXECUTE_bit] |
+            state[WAIT_ALU_OR_MEM_bit]
+   );
+
+   // The memory-read signal.
+   assign mem_rstrb =    state[EXECUTE_bit] & (isLoad  | isAMO & ~isAMOsc) | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   assign mem_wmask = {4{state[EXECUTE_bit] & (isStore | isAMO & isAMOsc & reserved_addr & amo_location_unchanged) | state[WRITE_AMO_bit]}} & STORE_wmask;
+
+   // aluWr starts computation (divide) in the ALU.
+   assign aluWr = state[EXECUTE_bit] & isALU;
+
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+
+   wire needToWait = isLoad | isStore | isDivide | isAMO;
+
+   wire [ADDR_WIDTH-1:0] PC_new =
+           isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+           jumpToPCplusImm  ? PCplusImm :
+           interrupt_return ? mepc :
+                              PCinc;
+
+   // Main sequencer, clocked on clk. While reset is low the control registers
+   // are (re)initialised; otherwise the case item of the state bit that is
+   // set is executed (FETCH_INSTR is handled by the default item).
+   always @(posedge clk) begin
+      if(!reset) begin
+         state             <= WAIT_ALU_OR_MEM;     //Just waiting for !mem_wbusy
+         PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
+         mcause            <= 0;
+         cached_addr       <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
+         fetch_second_half <= 0;
+         SkipFetch         <= 0;
+         amo_location      <= 0;
+         amo_location_unchanged <= 0;
+      end else begin
+
+         // See note [1] at the end of this file.
+         (* parallel_case *)
+         case(1'b1)
+
+           // Instruction word available: refresh the one-word fetch cache,
+           // then latch the decompressed instruction and its source registers.
+           state[WAIT_INSTR_bit]: begin
+              if(!mem_rbusy) begin // may be high when executing from SPI flash
+                 // Update cache
+                 if (~current_cache_hit | fetch_second_half) begin
+                    cached_addr <= mem_addr[ADDR_WIDTH-1:2];
+                    cached_data <= mem_rdata;
+                 end
+
+                 // Decode instruction
+                 rs1 <= registerFile[decompressed[19:15]];
+                 rs2 <= registerFile[decompressed[24:20]];
+                 instr      <= decompressed[31:2];
+                 long_instr <= &decomp_input[1:0];
+
+                 // Long opcode, unaligned, first part fetched,
+                 // happens in non-linear code
+                 if (current_unaligned_long & ~fetch_second_half) begin
+                    fetch_second_half <= 1;
+                    state <= FETCH_INSTR;
+                 end else begin
+                    fetch_second_half <= 0;
+                    state <= EXECUTE;
+                 end
+              end
+           end
+
+           state[EXECUTE_bit]: begin
+              if (interrupt) begin
+                 // Enter the handler: jump to mtvec, save in mepc the
+                 // address execution would have continued at, and set
+                 // mcause, which masks further interrupts.
+                 PC     <= mtvec;
+                 mepc   <= PC_new;
+                 mcause <= 1;
+                 state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+                 SkipFetch <= 0;
+              end else begin
+                 PC <= PC_new;
+                 if (interrupt_return) mcause <= 0;
+
+                 // FETCH_INSTR is skipped when the next instruction is
+                 // already in the fetch cache and is not an unaligned long
+                 // opcode. SkipFetch remembers that decision for the wait
+                 // states below.
+                 state <= needToWait                            ? WAIT_ALU_OR_MEM :
+                          next_cache_hit & ~next_unaligned_long ? WAIT_INSTR :
+                                                                  FETCH_INSTR;
+                 SkipFetch <= next_cache_hit & ~next_unaligned_long;
+
+                 // First half cached, second half still has to be fetched.
+                 fetch_second_half <= next_cache_hit & next_unaligned_long;
+              end
+
+              // Watching a reserved memory location
+              if (isAMO & isAMOlr) begin
+                amo_location           <= rs1[ADDR_WIDTH-1:0];
+                amo_location_unchanged <= 1;
+              end else
+              if (isAMO | (isStore & reserved_addr)) begin
+                amo_location_unchanged <= 0;
+              end
+           end
+
+           // Wait for the divider and the memory; amoALU is sampled here so
+           // that WRITE_AMO can write it back.
+           state[WAIT_ALU_OR_MEM_bit]: begin
+              if(!aluBusy & !mem_rbusy & !mem_wbusy) begin
+                 amo_wdata <= amoALU;
+                 state <= isAMO & ~isAMOlr & ~isAMOsc ? WRITE_AMO   :
+                                            SkipFetch ? WAIT_INSTR  :
+                                                        FETCH_INSTR ;
+              end
+           end
+
+           // Single cycle in which the AMO result is written to memory.
+           state[WRITE_AMO_bit]: begin
+              state <= WAIT_AMO;
+           end
+
+           // Wait for the AMO write to complete.
+           state[WAIT_AMO_bit]: begin
+              if(!mem_wbusy) state <= SkipFetch ? WAIT_INSTR : FETCH_INSTR;
+           end
+
+           default: begin // FETCH_INSTR
+              state <= WAIT_INSTR;
+           end
+         endcase
+      end
+   end
+
+`ifdef BENCH
+   initial begin
+      cycles = 0;
+      registerFile[0] = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+
+// If c[15:0] is a compressed (16-bit) instruction, decompresses it into d,
+// else (c[1:0] == 2'b11, long opcode) copies c to d unchanged. Purely combinatorial.
+module decompressor(
+   input  wire [31:0] c, // fetched instruction word (a compressed opcode sits in c[15:0])
+   output reg  [31:0] d  // equivalent 32-bit instruction
+);
+
+   // Replacement words for illegal and unknown opcodes (both are the all-zero word)
+
+   localparam illegal = 32'h00000000;
+   localparam unknown = 32'h00000000;
+
+   // Register decoder: 3-bit compressed register fields map to x8..x15 ({2'b01, field})
+
+   wire [4:0] rcl = {2'b01, c[4:2]}; // Register compressed low
+   wire [4:0] rch = {2'b01, c[9:7]}; // Register compressed high
+
+   wire [4:0] rwl  = c[ 6:2];  // Register wide low
+   wire [4:0] rwh  = c[11:7];  // Register wide high
+
+   localparam x0 = 5'b00000;
+   localparam x1 = 5'b00001;
+   localparam x2 = 5'b00010;
+
+   // Immediate decoder: reassembles the scattered immediate bits of each compressed format
+
+   wire  [4:0]    shiftImm = c[6:2];
+
+   wire [11:0] addi4spnImm = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
+   wire [11:0]     lwswImm = {5'b00000, c[5], c[12:10] , c[6], 2'b00};
+   wire [11:0]     lwspImm = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
+   wire [11:0]     swspImm = {4'b0000, c[8:7], c[12:9], 2'b00};
+
+   wire [11:0] addi16spImm = {{ 3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
+   wire [11:0]      addImm = {{ 7{c[12]}}, c[6:2]};
+
+   /* verilator lint_off UNUSED */
+   wire [12:0]        bImm = {{ 5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
+   wire [20:0]      jalImm = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
+   wire [31:0]      luiImm = {{15{c[12]}}, c[6:2], 12'b000000000000};
+   /* verilator lint_on UNUSED */
+
+   always @*
+   casez (c[15:0])
+                                                     // imm / funct7   +   rs2  rs1     fn3                   rd    opcode
+      16'b???___????????_???_11 : d =                                                                            c  ; // Long opcode, no need to decompress
+   // casez uses the first matching item, so the more specific of two overlapping patterns must come first.
+/* verilator lint_off CASEOVERLAP */
+
+      16'b000___00000000_000_00 : d =                                                                       illegal ; // c.illegal   -->  illegal
+      16'b000___????????_???_00 : d = {      addi4spnImm,             x2, 3'b000,                 rcl, 7'b00100_11} ; // c.addi4spn  -->  addi rd', x2, nzuimm[9:2]
+/* verilator lint_on CASEOVERLAP */
+
+      16'b010_???_???_??_???_00 : d = {          lwswImm,            rch, 3'b010,                 rcl, 7'b00000_11} ; // c.lw        -->  lw   rd', offset[6:2](rs1')
+      16'b110_???_???_??_???_00 : d = {    lwswImm[11:5],       rcl, rch, 3'b010,        lwswImm[4:0], 7'b01000_11} ; // c.sw        -->  sw   rs2', offset[6:2](rs1')
+
+      16'b000_???_???_??_???_01 : d = {           addImm,            rwh, 3'b000,                 rwh, 7'b00100_11} ; // c.addi      -->  addi rd, rd, nzimm[5:0]
+      16'b001____???????????_01 : d = {     jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12],   x1, 7'b11011_11} ; // c.jal       -->  jal  x1, offset[11:1]
+      16'b010__?_?????_?????_01 : d = {           addImm,             x0, 3'b000,                 rwh, 7'b00100_11} ; // c.li        -->  addi rd, x0, imm[5:0]
+      16'b011__?_00010_?????_01 : d = {      addi16spImm,            rwh, 3'b000,                 rwh, 7'b00100_11} ; // c.addi16sp  -->  addi x2, x2, nzimm[9:4]
+      16'b011__?_?????_?????_01 : d = {    luiImm[31:12],                                         rwh, 7'b01101_11} ; // c.lui       -->  lui  rd, nzuimm[17:12]
+      16'b100_?_00_???_?????_01 : d = {       7'b0000000,  shiftImm, rch, 3'b101,                 rch, 7'b00100_11} ; // c.srli      -->  srli rd', rd', shamt[5:0]
+      16'b100_?_01_???_?????_01 : d = {       7'b0100000,  shiftImm, rch, 3'b101,                 rch, 7'b00100_11} ; // c.srai      -->  srai rd', rd', shamt[5:0]
+      16'b100_?_10_???_?????_01 : d = {           addImm,            rch, 3'b111,                 rch, 7'b00100_11} ; // c.andi      -->  andi rd', rd', imm[5:0]
+      16'b100_011_???_00_???_01 : d = {       7'b0100000,       rcl, rch, 3'b000,                 rch, 7'b01100_11} ; // c.sub       -->  sub  rd', rd', rs2'
+      16'b100_011_???_01_???_01 : d = {       7'b0000000,       rcl, rch, 3'b100,                 rch, 7'b01100_11} ; // c.xor       -->  xor  rd', rd', rs2'
+      16'b100_011_???_10_???_01 : d = {       7'b0000000,       rcl, rch, 3'b110,                 rch, 7'b01100_11} ; // c.or        -->  or   rd', rd', rs2'
+      16'b100_011_???_11_???_01 : d = {       7'b0000000,       rcl, rch, 3'b111,                 rch, 7'b01100_11} ; // c.and       -->  and  rd', rd', rs2'
+      16'b101____???????????_01 : d = {     jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12],   x0, 7'b11011_11} ; // c.j         -->  jal  x0, offset[11:1]
+      16'b110__???_???_?????_01 : d = {bImm[12], bImm[10:5],     x0, rch, 3'b000, bImm[4:1], bImm[11], 7'b11000_11} ; // c.beqz      -->  beq  rs1', x0, offset[8:1]
+      16'b111__???_???_?????_01 : d = {bImm[12], bImm[10:5],     x0, rch, 3'b001, bImm[4:1], bImm[11], 7'b11000_11} ; // c.bnez      -->  bne  rs1', x0, offset[8:1]
+   // The c.ebreak item below is commented out, so its encoding is expanded by the c.jalr item with rs1 = x0.
+      16'b000__?_?????_?????_10 : d = {        7'b0000000, shiftImm, rwh, 3'b001,                 rwh, 7'b00100_11} ; // c.slli      -->  slli rd, rd, shamt[5:0]
+      16'b010__?_?????_?????_10 : d = {           lwspImm,            x2, 3'b010,                 rwh, 7'b00000_11} ; // c.lwsp      -->  lw   rd, offset[7:2](x2)
+      16'b100__0_?????_00000_10 : d = {  12'b000000000000,           rwh, 3'b000,                  x0, 7'b11001_11} ; // c.jr        -->  jalr x0, rs1, 0
+      16'b100__0_?????_?????_10 : d = {        7'b0000000,      rwl,  x0, 3'b000,                 rwh, 7'b01100_11} ; // c.mv        -->  add  rd, x0, rs2
+   // 16'b100__1_00000_00000_10 : d = {                              25'b00000000_00010000_00000000_0, 7'b11100_11} ; // c.ebreak    -->  ebreak
+      16'b100__1_?????_00000_10 : d = {  12'b000000000000,           rwh, 3'b000,                  x1, 7'b11001_11} ; // c.jalr      -->  jalr x1, rs1, 0
+      16'b100__1_?????_?????_10 : d = {        7'b0000000,      rwl, rwh, 3'b000,                 rwh, 7'b01100_11} ; // c.add       -->  add  rd, rd, rs2
+      16'b110__?_?????_?????_10 : d = {     swspImm[11:5],      rwl,  x2, 3'b010,        swspImm[4:0], 7'b01000_11} ; // c.swsp      -->  sw   rs2, offset[7:2](x2)
+
+      default:                    d =                                                                       unknown ; // Unknown opcode
+   endcase
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements,
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed.
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this:
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a state machine that is both more compact (fewer LUTs) and faster.
@@ -0,0 +1,523 @@
+/*******************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+//
+// This version: The "Intermissum", with full interrupt support.
+//             A single VERILOG file, compact & understandable code.
+//
+// Instruction set: RV32IM + CSR + MRET
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/*******************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32im"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-O3"
+`define NRV_INTERRUPTS
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         interrupt_request,
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+   /***************************************************************************/
+   // Instruction decoding.
+   /***************************************************************************/
+
+   // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
+   // Reference: Table page 104 of:
+   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+   // The destination register
+   wire [4:0] rdId = instr[11:7];
+
+   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+   // It is used as follows: funct3Is[val] <=> funct3 == val
+   (* onehot *)
+   wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
+   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
+   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
+   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
+   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
+   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
+   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
+   /* verilator lint_on UNUSED */
+
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
+
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+   reg [31:0] registerFile [31:0];
+
+   // Register write-back port: writes to register 0 are discarded.
+   always @(posedge clk) begin
+     if (writeBack && (rdId != 5'd0))
+       registerFile[rdId] <= writeBackData;
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except divisions.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   wire aluWr;               // ALU write strobe, starts dividing.
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   wire        LTU = aluMinus[32];
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   /***************************************************************************/
+
+   // A single right shifter serves both shift directions: for a left
+   // shift, the operand is bit-reversed before the shift and the result
+   // is bit-reversed again afterwards.
+
+   // Returns its 32-bit argument with the bit order reversed.
+   function [31:0] flip32;
+      input [31:0] x;
+      integer i;
+      begin
+         for (i = 0; i < 32; i = i + 1) flip32[i] = x[31-i];
+      end
+   endfunction
+
+   // funct3 == 1 selects the left shift, hence the reversed input.
+   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
+
+   // 33-bit arithmetic right shift: the extra MSB is the sign of aluIn1
+   // when instr[30] is set (SRA) and 0 otherwise (SRL and left shift).
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter =
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   wire [31:0] leftshift = flip32(shifter);
+
+   /***************************************************************************/
+
+   wire funcM     = instr[25];
+   wire isDivide  = isALUreg & funcM & instr[14]; // |funct3Is[7:4];
+   wire aluBusy   = |quotient_msk; // ALU is busy if division is in progress.
+
+   // funct3: 1->MULH, 2->MULHSU  3->MULHU
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   wire sign1 = aluIn1[31] &  isMULH;
+   wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
+
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+   wire signed [63:0] multiply = signed1 * signed2;
+
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   wire [31:0] aluOut_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   wire [31:0] aluOut_muldiv =
+     (  funct3Is[0]   ?  multiply[31: 0] : 32'b0) | // 0:MUL
+     ( |funct3Is[3:1] ?  multiply[63:32] : 32'b0) | // 1:MULH, 2:MULHSU, 3:MULHU
+     (  instr[14]     ?  div_sign ? -divResult : divResult : 32'b0) ; 
+                                                 // 4:DIV, 5:DIVU, 6:REM, 7:REMU
+   
+   wire [31:0] aluOut = isALUreg & funcM ? aluOut_muldiv : aluOut_base;
+
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+
+   reg [31:0] dividend;
+   reg [62:0] divisor;
+   reg [31:0] quotient;
+   reg [31:0] quotient_msk;
+
+   wire divstep_do = divisor <= {31'b0, dividend};
+
+   wire [31:0] dividendN     = divstep_do ? dividend - divisor[31:0] : dividend;
+   wire [31:0] quotientN     = divstep_do ? quotient | quotient_msk  : quotient;
+
+   wire div_sign = ~instr[12] & (instr[13] ? aluIn1[31] : 
+                    (aluIn1[31] != aluIn2[31]) & |aluIn2);
+
+   // Sequential divider: loaded when a DIV/REM instruction starts
+   // (isDivide & aluWr), then advanced by one quotient bit per clock.
+   always @(posedge clk) begin
+      if (isDivide & aluWr) begin
+         // Signed variants (instr[12] == 0) work on magnitudes; the sign
+         // is re-applied to the result through div_sign.
+         dividend     <= (~instr[12] & aluIn1[31]) ? -aluIn1 : aluIn1;
+         divisor      <= {((~instr[12] & aluIn2[31]) ? -aluIn2 : aluIn2), 31'b0};
+         quotient     <= 32'b0;
+         quotient_msk <= 32'h8000_0000;
+      end else begin
+         // One step: conditional subtract, then move on to the next bit.
+         dividend     <= dividendN;
+         quotient     <= quotientN;
+         divisor      <= divisor >> 1;
+         quotient_msk <= quotient_msk >> 1;
+      end
+   end
+      
+   reg  [31:0] divResult;
+   always @(posedge clk) divResult <= instr[13] ? dividendN : quotientN;
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   wire predicate =
+        funct3Is[0] &  EQ  | // BEQ
+        funct3Is[1] & !EQ  | // BNE
+        funct3Is[4] &  LT  | // BLT
+        funct3Is[5] & !LT  | // BGE
+        funct3Is[6] &  LTU | // BLTU
+        funct3Is[7] & !LTU ; // BGEU
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // An adder used to compute branch address, JAL address and AUIPC.
+   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                                                       Bimm[ADDR_WIDTH-1:0] );
+
+   // A separate adder to compute the destination of load/store.
+   // testing instr[5] is equivalent to testing isStore in this context.
+   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
+                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   /* verilator lint_off WIDTH */
+   assign mem_addr =   state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
+                       PC : loadstore_addr ;
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // Interrupt logic, CSR registers and opcodes.
+   /***************************************************************************/
+
+   // Interrupt logic:
+
+   // Remember interrupt requests as they are not checked for every cycle   
+   reg  interrupt_request_sticky;
+   // Interrupt enable and lock logic   
+   wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
+   // Processor accepts interrupts in EXECUTE state.   
+   wire interrupt_accepted = interrupt & state[EXECUTE_bit];        
+
+   // If current interrupt is accepted, there already might be the next one, 
+   // which should not be missed:
+   always @(posedge clk) begin
+     interrupt_request_sticky <= 
+        interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
+   end
+
+   // Decoder for mret opcode
+   wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
+
+   // CSRs:
+   reg  [ADDR_WIDTH-1:0] mepc;    // The saved program counter.
+   reg  [ADDR_WIDTH-1:0] mtvec;   // The address of the interrupt handler.
+   reg                   mstatus; // Interrupt enable
+   reg                   mcause;  // Interrupt cause (and lock)
+   reg  [63:0]           cycles;  // Cycle counter
+
+   always @(posedge clk) cycles <= cycles + 1;
+
+   wire sel_mstatus = (instr[31:20] == 12'h300);
+   wire sel_mtvec   = (instr[31:20] == 12'h305);
+   wire sel_mepc    = (instr[31:20] == 12'h341);
+   wire sel_mcause  = (instr[31:20] == 12'h342);
+   wire sel_cycles  = (instr[31:20] == 12'hC00);
+   wire sel_cyclesh = (instr[31:20] == 12'hC80);
+
+   // Read CSRs:
+   /* verilator lint_off WIDTH */
+   wire [31:0] CSR_read =
+     (sel_mstatus ? {28'b0, mstatus, 3'b0}  : 32'b0) |
+     (sel_mtvec   ? mtvec                   : 32'b0) |
+     (sel_mepc    ? mepc                    : 32'b0) |
+     (sel_mcause  ? {mcause, 31'b0}         : 32'b0) |
+     (sel_cycles  ? cycles[31:0]            : 32'b0) |
+     (sel_cyclesh ? cycles[63:32]           : 32'b0) ;
+   /* verilator lint_on WIDTH */
+
+   // Write CSRs: 5 bit unsigned immediate or content of RS1
+   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1; 
+
+   wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read  :
+                           (instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
+                        /* (instr[13:12] == 2'b01) ? */  CSR_modifier ;
+
+   // CSR write port. The active-low reset clears mstatus; otherwise a CSR
+   // instruction (SYSTEM opcode with non-zero funct3) in the EXECUTE state
+   // updates mstatus or mtvec, depending on the selected CSR address.
+   always @(posedge clk) begin
+      if (!reset)
+         mstatus <= 1'b0;
+      else if (isSYSTEM & (instr[14:12] != 3'b000) & state[EXECUTE_bit]) begin
+         if (sel_mstatus) mstatus <= CSR_write[3];
+         if (sel_mtvec)   mtvec   <= CSR_write[ADDR_WIDTH-1:0];
+      end
+   end
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   /* verilator lint_off WIDTH */
+   wire [31:0] writeBackData  =
+      (isSYSTEM            ? CSR_read  : 32'b0) |  // SYSTEM
+      (isLUI               ? Uimm      : 32'b0) |  // LUI
+      (isALU               ? aluOut    : 32'b0) |  // ALUreg, ALUimm
+      (isAUIPC             ? PCplusImm : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? PCplus4   : 32'b0) |  // JAL, JALR
+      (isLoad              ? LOAD_data : 32'b0) ;  // Load
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   wire mem_byteAccess     = instr[13:12] == 2'b00; // funct3[1:0] == 2'b00;
+   wire mem_halfwordAccess = instr[13:12] == 2'b01; // funct3[1:0] == 2'b01;
+
+   // LOAD, in addition to funct3[1:0], LOAD depends on:
+   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion
+
+   wire LOAD_sign =
+        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   wire [31:0] LOAD_data =
+         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
+     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                          mem_rdata ;
+
+   wire [15:0] LOAD_halfword =
+               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // STORE
+
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
+                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+
+   // The memory write mask:
+   //    1111                     if writing a word
+   //    0011 or 1100             if writing a halfword
+   //                                (depending on loadstore_addr[1])
+   //    0001, 0010, 0100 or 1000 if writing a byte
+   //                                (depending on loadstore_addr[1:0])
+
+   // Byte stores enable one lane (one-hot mask shifted by the byte offset),
+   // halfword stores enable the lower or upper pair, word stores all four.
+   wire [3:0] STORE_wmask =
+              mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0]) :
+              mem_halfwordAccess ? (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+                                   4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   localparam FETCH_INSTR_bit     = 0;
+   localparam WAIT_INSTR_bit      = 1;
+   localparam EXECUTE_bit         = 2;
+   localparam WAIT_ALU_OR_MEM_bit = 3;
+   localparam NB_STATES           = 4;
+
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR      = 1 << WAIT_INSTR_bit;
+   localparam EXECUTE         = 1 << EXECUTE_bit;
+   localparam WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // register write-back enable.
+   wire writeBack = ~(isBranch | isStore ) &
+                    (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]);
+
+   // The memory-read signal.
+   assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
+
+   // aluWr is the ALU write strobe: asserted in the EXECUTE state of an
+   // ALU instruction. Together with isDivide it starts a division (shifts
+   // are computed combinatorially in this core).
+   assign aluWr = state[EXECUTE_bit] & isALU;
+
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+
+   wire needToWait = isLoad | isStore | isDivide;
+
+   wire [ADDR_WIDTH-1:0] PC_new = 
+			 isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+                         jumpToPCplusImm  ? PCplusImm :
+                         interrupt_return ? mepc :
+                         PCplus4;
+
+   // The state machine proper. Besides the one-hot state it updates PC,
+   // the latched instruction with its source registers, and the interrupt
+   // bookkeeping registers mepc and mcause.
+   always @(posedge clk) begin
+      if(!reset) begin
+         // Reset is active low. Start in the wait state, which moves on to
+         // FETCH_INSTR once the ALU and the memory are no longer busy.
+         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+         mcause     <= 0;
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        // Instruction word available: latch it and read both source
+        // registers, indexed directly from the fetched word.
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              rs1 <= registerFile[mem_rdata[19:15]];
+              rs2 <= registerFile[mem_rdata[24:20]];
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE;         // also the declaration of instr).
+           end
+        end
+
+        state[EXECUTE_bit]: begin
+           if (interrupt) begin
+              // Interrupt taken: jump to the handler address (mtvec), save
+              // the PC the current instruction would have continued with,
+              // and set mcause, which masks further interrupts (see the
+              // definition of 'interrupt').
+              PC     <= mtvec;
+              mepc   <= PC_new;
+              mcause <= 1;
+           end else begin
+              PC <= PC_new;
+              // Returning from the handler re-arms interrupts.
+              if (interrupt_return) mcause <= 0;
+           end
+           // Loads, stores and divisions take an extra wait state.
+           state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+        end
+
+        // Stay here until the divider and the memory have both finished.
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+        end
+
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+
+      endcase
+   end
+
+`ifdef BENCH
+   initial begin
+      cycles = 0;
+      registerFile[0] = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements.
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed.
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this:
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a state machine that is both more compact (fewer LUTs) and faster.
+
@@ -0,0 +1,790 @@
+/******************************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+//
+// This version: PetitBateau (make it float), RV32IMFC
+// Rounding works as follows:
+// - all subnormals are flushed to zero
+// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
+// - FDIV and FSQRT do not have correct rounding
+//
+// [TODO] add FPU CSR (and instret for perf stat)
+// [TODO] FSW/FLW unaligned (does not seem to occur, but the spec requires it)
+// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
+// [TODO] support IEEE754 denormals
+// [TODO] NaNs propagation and infinity
+// [TODO] support all IEEE754 rounding modes
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/******************************************************************************/
+
+`include "petitbateau.v"
+
+// Firmware generation flags for this processor
+//    Note: atomic instructions not supported, but 'a' is set in
+//    compiler flag, because there is no toolchain/libs for
+//    rv32imfc / imf in most risc-V compiler distributions.
+
+`define NRV_ARCH     "rv32imafc" 
+`define NRV_ABI      "ilp32f"
+
+`define NRV_OPTIMIZE "-O3"
+`define NRV_INTERRUPTS
+
+// Check condition and display message in simulation
+`ifdef BENCH
+ `define ASSERT(cond,msg) if(!(cond)) $display msg
+ `define ASSERT_NOT_REACHED(msg) $display msg
+`else
+ `define ASSERT(cond,msg)
+ `define ASSERT_NOT_REACHED(msg)
+`endif
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         interrupt_request,
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   // Reverse the bit order of a 32-bit word. Used by the shifter (a single
+   // shifter for left and right shifts, saves silicon!)
+   function [31:0] flip32;
+      input [31:0] x;
+      integer i;
+      begin
+         // Result bit i takes source bit 31-i.
+         for (i = 0; i < 32; i = i + 1) flip32[i] = x[31-i];
+      end
+   endfunction
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+   /***************************************************************************/
+   // Instruction decoding.
+   /***************************************************************************/
+
+   // Reference: Table page 104 of:
+   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+   wire [2:0] funct3 = instr[14:12];
+   
+   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+   // It is used as follows: funct3Is[val] <=> funct3 == val
+   (* onehot *) wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
+   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
+   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
+   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
+   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
+   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
+   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
+   /* verilator lint_on UNUSED */
+
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   wire isLoad    =  (instr[6:3] == 4'b0000 ); // rd <-mem[rs1+Iimm] (bit 2:FLW)
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm   
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isStore   =  (instr[6:3] == 4'b0100 ); // mem[rs1+Simm]<-rs2 (bit 2:FSW)
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
+   wire isFPU     =  (instr[6:5] == 2'b10);    // all FPU instr except FLW/FSW
+   
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+   reg [31:0] rs3; // this one is used by the FMA instructions.
+   
+   reg [31:0] registerFile [63:0]; //  0..31: integer registers
+                                   // 32..63: floating-point registers
+   
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except divisions.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   wire aluWr; // ALU write strobe, starts dividing.
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   wire        LTU = aluMinus[32];
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   /***************************************************************************/
+
+   // Use the same shifter both for left and right shifts by 
+   // applying bit reversal
+
+   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
+   
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter = 
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   wire [31:0] leftshift = flip32(shifter);
+   
+   /***************************************************************************/
+
+   wire funcM     = instr[25];
+   wire isDivide  = isALUreg & funcM & instr[14];
+   wire aluBusy   = |div_cnt; // ALU is busy if division is in progress.
+
+   // funct3: 1->MULH, 2->MULHSU  3->MULHU
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   wire sign1 = aluIn1[31] &  isMULH;
+   wire sign2 = aluIn2[31] & (isMULH | isMULHSU);
+
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+
+   wire signed [63:0]  multiply = signed1 * signed2;      
+   
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   wire [31:0] aluOut_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   reg [31:0]  aluOut_mul;
+   always @(posedge clk) begin
+      aluOut_mul <= funct3Is[0] ? multiply[31:0] : multiply[63:32];
+   end
+
+   reg [31:0]  aluOut_div;
+   always @(posedge clk) begin
+      (* parallel_case, full_case *)
+      case(1'b1)
+	 instr[13] &  div_sign: aluOut_div <= -dividend;
+	 instr[13] & !div_sign: aluOut_div <=  dividend;
+	!instr[13] &  div_sign: aluOut_div <= -quotient;
+	!instr[13] & !div_sign: aluOut_div <=  quotient;	
+      endcase
+   end
+
+   reg [31:0] aluOut;
+   always @(*) begin
+      (* parallel_case *)
+      case(1'b1)
+	isALUreg & funcM &  instr[14]: aluOut = aluOut_div;
+	isALUreg & funcM & !instr[14]: aluOut = aluOut_mul;
+	default: aluOut = aluOut_base;
+      endcase
+   end
+    
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+
+   reg [31:0] dividend;
+   reg [62:0] divisor;
+   reg [31:0] quotient;
+   reg [5:0]  div_cnt;
+   reg div_sign;
+   
+   always @(posedge clk) begin
+      if (aluWr) begin
+	 div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] : 
+                                  (aluIn1[31] != aluIn2[31]) & |aluIn2);
+         dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+         divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+         quotient <= 0;
+	 div_cnt <= isDivide ? 33 : 0; // one additional cycle for aluOut_div
+      end else begin
+	 if(aluBusy) div_cnt <= div_cnt - 1;
+      end
+      if(|div_cnt[5:1]) begin 
+         divisor <= divisor >> 1;
+	 if(divisor <= {31'b0, dividend}) begin
+	    quotient <= {quotient[30:0],1'b1};
+	    dividend <= dividend - divisor[31:0];
+	 end else begin
+	    quotient <= {quotient[30:0],1'b0};	    
+	 end
+      end
+   end 
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+
+   wire predicate = funct3Is[0] &  EQ  | // BEQ
+                    funct3Is[1] & !EQ  | // BNE
+                    funct3Is[4] &  LT  | // BLT
+                    funct3Is[5] & !LT  | // BGE
+                    funct3Is[6] &  LTU | // BLTU
+                    funct3Is[7] & !LTU ; // BGEU
+
+   /***************************************************************************/
+   // Registers read-write 
+   /***************************************************************************/
+
+   // Single always block for all register file accesses. Priority within a
+   // clock: operand read for a freshly received instruction, then operand
+   // read for a decompressed instruction, then write-back of the result.
+   always @(posedge clk) begin
+      if (state[WAIT_INSTR_bit]) begin
+         // Operands are read directly from the incoming word (raw_instr)
+         // as soon as it is available. rs3 is always taken from the FP
+         // half of the bank (index bit 5 forced to 1).
+         rs3 <= registerFile[{1'b1,        raw_instr[31:27]}];
+         rs2 <= registerFile[{raw_rs2IsFP, raw_instr[24:20]}];
+         rs1 <= registerFile[{raw_rs1IsFP, raw_instr[19:15]}];
+      end else if (state[DECOMPRESS_GETREGS_bit]) begin
+         // Compressed instruction: read again using the register fields of
+         // the decompressed instruction. rs3 is not refreshed because there
+         // is no compressed FMA.
+         rs2 <= registerFile[{decomp_rs2IsFP, instr[24:20]}];
+         rs1 <= registerFile[{decomp_rs1IsFP, instr[19:15]}];
+      end else if (writeBack & !fpuBusy & (rdIsFP | (|instr[11:7]))) begin
+         // Integer register 0 is never written; FP register 0 is.
+         registerFile[{rdIsFP, instr[11:7]}] <= writeBackData;
+      end
+   end
+
+   /***************************************************************************/
+   // The FPU 
+   /***************************************************************************/
+
+   // Floating-point unit. It is strobed (wr) in the EXECUTE state of an FPU
+   // instruction; fpuBusy holds off register write-back and keeps the state
+   // machine in its wait states.
+   wire [31:0] fpuOut;
+   wire        fpuBusy;
+   PetitBateau FPU(
+      .out  (fpuOut),
+      .busy (fpuBusy),
+      .rs3  (rs3),
+      .rs2  (rs2),
+      .rs1  (rs1),
+      .instr(instr[31:2]),
+      .wr   (state[EXECUTE_bit] & isFPU),
+      .clk  (clk)
+   );
+   
+   // There is a single register bank, registers 0..31 are the integer
+   // registers, and 32..63 are the floating point registers, hence
+   // bit 5 of rs1,rs2,rd index is set to 0 for an integer register
+   // and 1 for a fp register. 
+
+   // High when the destination register lives in the floating-point half
+   // of the register bank.
+   wire rdIsFP =
+        (instr[6:4] == 3'b100  )               ||  // F{N}MADD,F{N}MSUB
+        (instr[6:2] == 5'b00001)               ||  // FLW
+        ( (instr[6:4] == 3'b101) &&
+          ( (instr[31:28] == 4'b1111) ||           // FMV.W.X
+            (instr[31:28] == 4'b1101) ||           // FCVT.S.W{U}
+            !instr[31]                             // R-Type FPU
+          )
+        );
+
+   // rs1 is a FP register when instr[6:5] = 2'b10, except for the two
+   // instructions that read an integer source:
+   //   FCVT.S.W{U}:  instr[6:2] = 5'b10100 and instr[31:28] = 4'b1101
+   //   FMV.W.X    :  instr[6:2] = 5'b10100 and instr[31:28] = 4'b1111
+   // Each signal exists twice: raw_* decodes the incoming word (raw_instr),
+   // decomp_* decodes the latched instruction and is used for compressed
+   // instructions once they have been decompressed.
+   wire raw_rs1IsFP =
+        (raw_instr[6:5] == 2'b10) &&
+        ( (raw_instr[4:2] != 3'b100) ||
+          ( (raw_instr[31:28] != 4'b1101) &&      // not FCVT.S.W{U}
+            (raw_instr[31:28] != 4'b1111) ) );    // not FMV.W.X
+
+   wire decomp_rs1IsFP =
+        (instr[6:5] == 2'b10) &&
+        ( (instr[4:2] != 3'b100) ||
+          ( (instr[31:28] != 4'b1101) &&          // not FCVT.S.W{U}
+            (instr[31:28] != 4'b1111) ) );        // not FMV.W.X
+
+   // rs2 is a FP register when instr[6:5] = 2'b10, or for FSW
+   // (instr[6:2] = 5'b01001).
+   wire raw_rs2IsFP    = (raw_instr[6:2] == 5'b01001) || (raw_instr[6:5] == 2'b10);
+   wire decomp_rs2IsFP = (instr[6:2]     == 5'b01001) || (instr[6:5]     == 2'b10);
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC;    // Program counter.
+   reg  [31:2]           instr; // Latched instruction; bits 1:0 are not stored.
+
+   // Sequential successor of PC: +4 after a 32-bit instruction (long_instr),
+   // +2 after a compressed one.
+   wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+   wire [ADDR_WIDTH-1:0] PCinc   = (long_instr) ? PCplus4 : PCplus2;
+
+   // Shared PC-relative adder:
+   //    JAL    (instr[3] set) -> PC + Jimm
+   //    AUIPC  (instr[4] set) -> PC + Uimm
+   //    branch (otherwise)    -> PC + Bimm
+   wire [ADDR_WIDTH-1:0] PCplusImm =
+        PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0]
+                        : ( instr[4] ? Uimm[ADDR_WIDTH-1:0]
+                                     : Bimm[ADDR_WIDTH-1:0] ) );
+
+   // Dedicated adder for load/store addresses. instr[5] separates stores
+   // (Simm) from loads (Iimm) in this context.
+   wire [ADDR_WIDTH-1:0] loadstore_addr =
+        (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0])
+        + rs1[ADDR_WIDTH-1:0];
+
+   // Memory address: a word-aligned instruction address while fetching
+   // (the word after PC when the second half of an unaligned instruction is
+   // being fetched), the load/store address otherwise.
+   /* verilator lint_off WIDTH */
+   assign mem_addr =
+        (state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit])
+           ? ( fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
+                                 : {PC     [ADDR_WIDTH-1:2], 2'b00} )
+           : loadstore_addr;
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // Interrupt logic, CSR registers and opcodes.
+   /***************************************************************************/
+
+   // Interrupt requests are only examined in the EXECUTE state, so a request
+   // is latched here until it has been taken.
+   reg  interrupt_request_sticky;
+
+   // A pending request is honoured when interrupts are enabled (mstatus) and
+   // no handler is currently running (mcause doubles as the lock).
+   wire interrupt = ~mcause & mstatus & interrupt_request_sticky;
+
+   // The request is consumed in the EXECUTE state.
+   wire interrupt_accepted = state[EXECUTE_bit] & interrupt;
+
+   // Clear the latch when the request is accepted, but keep (or set) it if
+   // a new request arrives in that same clock so that it is not lost.
+   always @(posedge clk) begin
+      interrupt_request_sticky <=
+          (interrupt_request_sticky & ~interrupt_accepted) | interrupt_request;
+   end
+
+   // MRET detection. Only the SYSTEM opcode and funct3 == 0 are examined;
+   // the comparison of instr[31:20] against 12'h302 is left commented out.
+   wire interrupt_return = funct3Is[0] & isSYSTEM; // & (instr[31:20]==12'h302);
+
+   // CSR storage.
+   reg  [63:0]           cycles;  // Free-running cycle counter.
+   reg                   mcause;  // Interrupt cause, also used as lock.
+   reg                   mstatus; // Interrupt enable.
+   reg  [ADDR_WIDTH-1:0] mtvec;   // Address of the interrupt handler.
+   reg  [ADDR_WIDTH-1:0] mepc;    // Saved program counter.
+
+   always @(posedge clk) begin
+      cycles <= cycles + 1;
+   end
+
+   // CSR address decode (instr[31:20]).
+   wire sel_mstatus = (instr[31:20] == 12'h300);
+   wire sel_mtvec   = (instr[31:20] == 12'h305);
+   wire sel_mepc    = (instr[31:20] == 12'h341);
+   wire sel_mcause  = (instr[31:20] == 12'h342);
+   wire sel_cycles  = (instr[31:20] == 12'hC00);
+   wire sel_cyclesh = (instr[31:20] == 12'hC80);
+
+   // CSR read mux: each source is masked by its select and the results are
+   // OR-ed together. mstatus is presented at bit 3, mcause at bit 31.
+   /* verilator lint_off WIDTH */
+   wire [31:0] CSR_read =
+     ({32{sel_cyclesh}} & cycles[63:32]         ) |
+     ({32{sel_cycles }} & cycles[31:0]          ) |
+     ({32{sel_mcause }} & {mcause, 31'b0}       ) |
+     ({32{sel_mepc   }} & mepc                  ) |
+     ({32{sel_mtvec  }} & mtvec                 ) |
+     ({32{sel_mstatus}} & {28'b0, mstatus, 3'b0}) ;
+   /* verilator lint_on WIDTH */
+
+   // CSR write operand: the zero-extended 5-bit immediate when instr[14] is
+   // set, the content of rs1 otherwise.
+   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1;
+
+   // New CSR value: instr[13:12] = 10 sets bits, 11 clears bits, anything
+   // else writes the operand as is.
+   wire [31:0] CSR_write =
+        (instr[13:12] == 2'b10) ? (CSR_modifier | CSR_read)   :
+        (instr[13:12] == 2'b11) ? (~CSR_modifier & CSR_read)  :
+     /* (instr[13:12] == 2'b01) ? */ CSR_modifier ;
+
+   // Only mstatus and mtvec are written by CSR instructions here; reset
+   // (active low) clears mstatus.
+   always @(posedge clk) begin
+      if (!reset) begin
+         mstatus <= 0;
+      end else if (state[EXECUTE_bit] & isSYSTEM & (instr[14:12] != 0)) begin
+         if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
+         if (sel_mstatus) mstatus <= CSR_write[3];
+      end
+   end
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   // Write-back mux: every source is masked by its instruction-class signal
+   // and the masked values are OR-ed together.
+   /* verilator lint_off WIDTH */
+   wire [31:0] writeBackData  =
+      ({32{isLoad         }} & LOAD_data) |  // Load
+      ({32{isJALR | isJAL }} & PCinc    ) |  // JAL, JALR
+      ({32{isAUIPC        }} & PCplusImm) |  // AUIPC
+      ({32{isFPU          }} & fpuOut   ) |  // FPU
+      ({32{isALU          }} & aluOut   ) |  // ALUreg, ALUimm
+      ({32{isLUI          }} & Uimm     ) |  // LUI
+      ({32{isSYSTEM       }} & CSR_read ) ;  // SYSTEM
+   /* verilator lint_on WIDTH */
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   // TODO: support unaligned accesses for FLW and FSW 
+   
+   // Access size comes from funct3[1:0] (instr[13:12]). instr[2] is set for
+   // FLW and FSW, which are never treated as byte or halfword accesses.
+   wire mem_byteAccess     = (instr[13:12] == 2'b00) && !instr[2];
+   wire mem_halfwordAccess = (instr[13:12] == 2'b01) && !instr[2];
+
+   // LOAD: pick the addressed halfword out of the word that was read, then
+   // the addressed byte out of that halfword.
+   wire [15:0] LOAD_halfword =
+               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // The value is sign-extended only when funct3[2] (instr[14]) is clear.
+   wire LOAD_sign =
+        (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]) & !instr[14];
+
+   wire [31:0] LOAD_data =
+        mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
+        mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                             mem_rdata ;
+
+   // STORE: the low bytes of rs2 are replicated into every byte lane they
+   // may be written to; the write mask below selects the lanes that are
+   // actually stored.
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
+                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+
+   // Byte-lane write mask:
+   //    word     -> 1111
+   //    halfword -> 0011 or 1100, selected by loadstore_addr[1]
+   //    byte     -> 0001, 0010, 0100 or 1000, selected by loadstore_addr[1:0]
+   wire [3:0] STORE_wmask =
+              mem_byteAccess ?
+                    ( loadstore_addr[0] ?
+                          (loadstore_addr[1] ? 4'b1000 : 4'b0010) :
+                          (loadstore_addr[1] ? 4'b0100 : 4'b0001) ) :
+              mem_halfwordAccess ?
+                    ( loadstore_addr[1] ? 4'b1100 : 4'b0011 ) :
+              4'b1111;
+
+   /***************************************************************************/
+   // Unaligned fetch mechanism and compressed opcode handling
+   /***************************************************************************/
+
+   // One-word fetch cache: the last instruction word read from memory and
+   // its word address.
+   reg [ADDR_WIDTH-1:2] cached_addr;
+   reg           [31:0] cached_data;
+
+   reg fetch_second_half; // second word of an unaligned 32-bit instr pending
+   reg long_instr;        // latched instruction is a 32-bit one
+
+   // Does the cached word cover the current / the next program counter ?
+   wire current_cache_hit = (PC     [ADDR_WIDTH-1:2] == cached_addr);
+   wire    next_cache_hit = (PC_new [ADDR_WIDTH-1:2] == cached_addr);
+
+   // Word holding the start of the current instruction: the cached copy on
+   // a hit, the word delivered by memory otherwise.
+   wire [31:0] cached_mem = current_cache_hit ? cached_data : mem_rdata;
+
+   // An instruction starting in the upper halfword (PC[1] set) whose two
+   // lowest bits are 11 is a 32-bit instruction straddling two words.
+   wire current_unaligned_long = PC    [1] & (&cached_mem [17:16]);
+   wire    next_unaligned_long = PC_new[1] & (&cached_data[17:16]);
+
+   // Instruction bits as seen at PC: for an unaligned start, the upper
+   // halfword of cached_mem followed by the lower halfword of mem_rdata.
+   wire [31:0] raw_instr = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]}
+                                 : cached_mem;
+
+   // Expansion of the low 16 bits, used when they form a compressed instr.
+   wire [31:0] decompressed;
+   decompressor _decomp ( .d(decompressed), .c(raw_instr[15:0]) );
+   
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   // One-hot state encoding: bit position of every state ...
+   localparam FETCH_INSTR_bit          = 0,
+              WAIT_INSTR_bit           = 1,
+              DECOMPRESS_GETREGS_bit   = 2,
+              EXECUTE_bit              = 3,
+              WAIT_ALU_OR_MEM_bit      = 4,
+              WAIT_ALU_OR_MEM_SKIP_bit = 5;
+
+   localparam NB_STATES                = 6;
+
+   // ... and the matching state values.
+   localparam FETCH_INSTR          = 1 << FETCH_INSTR_bit,
+              WAIT_INSTR           = 1 << WAIT_INSTR_bit,
+              DECOMPRESS_GETREGS   = 1 << DECOMPRESS_GETREGS_bit,
+              EXECUTE              = 1 << EXECUTE_bit,
+              WAIT_ALU_OR_MEM      = 1 << WAIT_ALU_OR_MEM_bit,
+              WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // Register write-back enable: never for branches and stores, never while
+   // the FPU is busy, and only in EXECUTE or one of the two wait states.
+   wire writeBack = ~isBranch & ~isStore & !fpuBusy & (
+            state[WAIT_ALU_OR_MEM_SKIP_bit] |
+            state[WAIT_ALU_OR_MEM_bit]      |
+            state[EXECUTE_bit]
+   );
+
+   // Memory read strobe: instruction fetch, or a load being executed.
+   assign mem_rstrb = state[FETCH_INSTR_bit] | (state[EXECUTE_bit] & isLoad);
+
+   // Memory write mask: the store mask, enabled while a store executes.
+   assign mem_wmask = STORE_wmask & {4{state[EXECUTE_bit] & isStore}};
+
+   // ALU write strobe (starts a divide in the ALU).
+   assign aluWr = isALU & state[EXECUTE_bit];
+
+   // JAL, or a conditional branch whose predicate holds.
+   wire jumpToPCplusImm = (isBranch & predicate) | isJAL;
+
+   // Instructions that go through a wait state after EXECUTE. When
+   // NRV_IS_IO_ADDR is defined, stores only wait in mapped IO space.
+`ifdef NRV_IS_IO_ADDR
+   wire needToWait = isFPU                                 |
+                     (isALUreg & funcM)                    |
+                     (isStore & `NRV_IS_IO_ADDR(mem_addr)) |
+                     isLoad;
+`else
+   wire needToWait = isFPU              |
+                     (isALUreg & funcM) |
+                     isStore            |
+                     isLoad;
+`endif
+
+   // Next program counter, in priority order: JALR target (bit 0 cleared),
+   // taken branch / JAL target, interrupt return address, next instruction.
+   wire [ADDR_WIDTH-1:0] PC_new =
+           isJALR ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+           ( jumpToPCplusImm ? PCplusImm :
+             ( interrupt_return ? mepc : PCinc ) );
+
+   // Main state machine (one-hot, see the state localparams).
+   //   FETCH_INSTR          -> WAIT_INSTR
+   //   WAIT_INSTR           -> FETCH_INSTR (second half of an unaligned
+   //                           32-bit instruction), EXECUTE (32-bit instr)
+   //                           or DECOMPRESS_GETREGS (compressed instr)
+   //   DECOMPRESS_GETREGS   -> EXECUTE
+   //   EXECUTE              -> one of the two wait states when needToWait,
+   //                           else WAIT_INSTR (next instr already cached)
+   //                           or FETCH_INSTR
+   //   WAIT_ALU_OR_MEM      -> FETCH_INSTR once ALU, FPU and memory are idle
+   //   WAIT_ALU_OR_MEM_SKIP -> WAIT_INSTR  once ALU, FPU and memory are idle
+   // reset is active low.
+   always @(posedge clk) begin
+      if(!reset) begin
+         state             <= WAIT_ALU_OR_MEM;     //Just waiting for !mem_wbusy
+         PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
+         mcause            <= 0;
+         cached_addr       <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
+         fetch_second_half <= 0;
+      end else begin
+
+         // See note [1] at the end of this file.
+         (* parallel_case *)
+         case(1'b1)
+
+           state[WAIT_INSTR_bit]: begin
+              if(!mem_rbusy) begin // may be high when executing from SPI flash
+                 // Update cache
+                 // (no ';' after 'end': a null statement inside begin/end
+                 //  is not legal in plain Verilog, only in SystemVerilog)
+                 if (~current_cache_hit | fetch_second_half) begin
+                    cached_addr <= mem_addr[ADDR_WIDTH-1:2];
+                    cached_data <= mem_rdata;
+                 end
+
+                 // Decode instruction
+                 // Registers are fetched at the same time, in the
+                 // register read/write always block.
+                 instr  <= &raw_instr[1:0] ? raw_instr[31:2]
+                                           : decompressed[31:2];
+                 long_instr <= &raw_instr[1:0];
+
+                 // Long opcode, unaligned, first part fetched,
+                 // happens in non-linear code
+                 if (current_unaligned_long & ~fetch_second_half) begin
+                    fetch_second_half <= 1;
+                    state <= FETCH_INSTR;
+                 end else begin
+                    fetch_second_half <= 0;
+                    state <= &raw_instr[1:0] ? EXECUTE : DECOMPRESS_GETREGS;
+                 end
+              end
+           end
+
+           state[DECOMPRESS_GETREGS_bit]: begin
+              // The registers of the decompressed instruction are fetched
+              // in the register read/write always block.
+              state <= EXECUTE;
+           end
+
+           state[EXECUTE_bit]: begin
+              if (interrupt) begin
+                 // Enter the handler: remember where to resume and lock
+                 // further interrupts through mcause.
+                 PC     <= mtvec;
+                 mepc   <= PC_new;
+                 mcause <= 1;
+                 state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+              end else begin
+                 // Unaligned load/store not implemented yet
+                 // (the norm supposes that FLW and FSW can handle them)
+                 `ASSERT(
+                     !((isLoad|isStore) && instr[2] && |loadstore_addr[1:0]),
+                     ("PC=%x UNALIGNED FLW/FSW",PC)
+                 );
+
+                 PC <= PC_new;
+                 if (interrupt_return) mcause <= 0;
+
+                 // The fetch is skipped when the next instruction is fully
+                 // contained in the cached word.
+                 state <= next_cache_hit & ~next_unaligned_long
+                        ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
+                        : (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);
+
+                 fetch_second_half <= next_cache_hit & next_unaligned_long;
+              end
+           end
+
+           state[WAIT_ALU_OR_MEM_bit]: begin
+              if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
+                 state <= FETCH_INSTR;
+              end
+           end
+
+           state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
+              if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
+                 state <= WAIT_INSTR;
+              end
+           end
+
+           default: begin // FETCH_INSTR
+              state <= WAIT_INSTR;
+           end
+         endcase
+      end
+   end
+
+// Simulation only: give the cycle counter and entry 0 of the register file
+// a defined start value.
+`ifdef BENCH
+   initial begin
+      registerFile[0] = 0;
+      cycles = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+
+module decompressor(
+   input  wire [15:0] c,
+   output reg  [31:0] d
+);
+
+   // Combinational expansion of one 16-bit compressed instruction `c` into
+   // the equivalent 32-bit instruction `d`.
+   // Notes: * illegal, unknown, x0, x1, x2 are localparams, not wires
+   //        * decoding could be split into multiple cycles if the
+   //          decompressor turns out to be a bottleneck
+
+   // Values produced for illegal and unknown opcodes
+   // (unknown is not referenced by the decode table below).
+   localparam illegal = 32'h0;
+   localparam unknown = 32'h0;
+
+   // 32-bit opcodes produced by the expansion
+   localparam [6:0] OP_LOAD   = 7'b00000_11,
+                    OP_FLW    = 7'b00001_11,
+                    OP_ALUIMM = 7'b00100_11,
+                    OP_STORE  = 7'b01000_11,
+                    OP_FSW    = 7'b01001_11,
+                    OP_ALUREG = 7'b01100_11,
+                    OP_LUI    = 7'b01101_11,
+                    OP_BRANCH = 7'b11000_11,
+                    OP_JALR   = 7'b11001_11,
+                    OP_JAL    = 7'b11011_11;
+
+   // funct7 values used by the register and shift forms
+   localparam [6:0] F7_BASE = 7'b0000000,
+                    F7_ALT  = 7'b0100000;   // sub, srai
+
+   // Fixed registers
+   localparam x0 = 5'b00000;
+   localparam x1 = 5'b00001;
+   localparam x2 = 5'b00010;
+
+   // Register fields: 3-bit fields address x8..x15, 5-bit fields any register
+   wire [4:0] rcl = {2'b01, c[4:2]}; // Register compressed low
+   wire [4:0] rch = {2'b01, c[9:7]}; // Register compressed high
+   wire [4:0] rwl = c[ 6:2];         // Register wide low
+   wire [4:0] rwh = c[11:7];         // Register wide high
+
+   // Immediates, unscrambled from the compressed encodings
+   wire  [4:0]    shiftImm = c[6:2];
+
+   wire [11:0] addi4spnImm = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
+   wire [11:0]     lwswImm = {5'b00000, c[5], c[12:10] , c[6], 2'b00};
+   wire [11:0]     lwspImm = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
+   wire [11:0]     swspImm = {4'b0000, c[8:7], c[12:9], 2'b00};
+
+   wire [11:0] addi16spImm = {{ 3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
+   wire [11:0]      addImm = {{ 7{c[12]}}, c[6:2]};
+
+   /* verilator lint_off UNUSED */
+   wire [12:0]        bImm = {{ 5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
+   wire [20:0]      jalImm = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2], c[11], c[5:3], 1'b0};
+   wire [31:0]      luiImm = {{15{c[12]}}, c[6:2], 12'b000000000000};
+   /* verilator lint_on UNUSED */
+
+   // Jump and branch offsets, re-scrambled into the 32-bit J and B layouts
+   wire [19:0] jField = {jalImm[20], jalImm[10:1], jalImm[11], jalImm[19:12]};
+   wire  [6:0] bHi    = {bImm[12], bImm[10:5]};
+   wire  [4:0] bLo    = {bImm[4:1], bImm[11]};
+
+   // Decode table. Item order matters: several patterns overlap and casez
+   // selects the first one that matches.
+   // 16'b???___????????_???_11 is a long opcode and needs no decompression.
+   always @* begin
+      casez (c[15:0])
+
+/* verilator lint_off CASEOVERLAP */
+         // c.illegal   -->  illegal
+         16'b000___00000000_000_00 : d = illegal;
+         // c.addi4spn  -->  addi rd', x2, nzuimm[9:2]
+         16'b000___????????_???_00 : d = {addi4spnImm, x2, 3'b000, rcl, OP_ALUIMM};
+/* verilator lint_on CASEOVERLAP */
+
+         // c.lw        -->  lw   rd', offset[6:2](rs1')
+         16'b010_???_???_??_???_00 : d = {lwswImm, rch, 3'b010, rcl, OP_LOAD};
+         // c.sw        -->  sw   rs2', offset[6:2](rs1')
+         16'b110_???_???_??_???_00 : d = {lwswImm[11:5], rcl, rch, 3'b010, lwswImm[4:0], OP_STORE};
+
+         // c.addi      -->  addi rd, rd, nzimm[5:0]
+         16'b000_???_???_??_???_01 : d = {addImm, rwh, 3'b000, rwh, OP_ALUIMM};
+         // c.jal       -->  jal  x1, offset[11:1]
+         16'b001____???????????_01 : d = {jField, x1, OP_JAL};
+         // c.li        -->  addi rd, x0, imm[5:0]
+         16'b010__?_?????_?????_01 : d = {addImm, x0, 3'b000, rwh, OP_ALUIMM};
+         // c.addi16sp  -->  addi x2, x2, nzimm[9:4]
+         16'b011__?_00010_?????_01 : d = {addi16spImm, rwh, 3'b000, rwh, OP_ALUIMM};
+         // c.lui       -->  lui  rd, nzuimm[17:12]
+         16'b011__?_?????_?????_01 : d = {luiImm[31:12], rwh, OP_LUI};
+         // c.srli      -->  srli rd', rd', shamt[5:0]
+         16'b100_?_00_???_?????_01 : d = {F7_BASE, shiftImm, rch, 3'b101, rch, OP_ALUIMM};
+         // c.srai      -->  srai rd', rd', shamt[5:0]
+         16'b100_?_01_???_?????_01 : d = {F7_ALT,  shiftImm, rch, 3'b101, rch, OP_ALUIMM};
+         // c.andi      -->  andi rd', rd', imm[5:0]
+         16'b100_?_10_???_?????_01 : d = {addImm, rch, 3'b111, rch, OP_ALUIMM};
+         // c.sub       -->  sub  rd', rd', rs2'
+         16'b100_011_???_00_???_01 : d = {F7_ALT,  rcl, rch, 3'b000, rch, OP_ALUREG};
+         // c.xor       -->  xor  rd', rd', rs2'
+         16'b100_011_???_01_???_01 : d = {F7_BASE, rcl, rch, 3'b100, rch, OP_ALUREG};
+         // c.or        -->  or   rd', rd', rs2'
+         16'b100_011_???_10_???_01 : d = {F7_BASE, rcl, rch, 3'b110, rch, OP_ALUREG};
+         // c.and       -->  and  rd', rd', rs2'
+         16'b100_011_???_11_???_01 : d = {F7_BASE, rcl, rch, 3'b111, rch, OP_ALUREG};
+         // c.j         -->  jal  x0, offset[11:1]
+         16'b101____???????????_01 : d = {jField, x0, OP_JAL};
+         // c.beqz      -->  beq  rs1', x0, offset[8:1]
+         16'b110__???_???_?????_01 : d = {bHi, x0, rch, 3'b000, bLo, OP_BRANCH};
+         // c.bnez      -->  bne  rs1', x0, offset[8:1]
+         16'b111__???_???_?????_01 : d = {bHi, x0, rch, 3'b001, bLo, OP_BRANCH};
+
+         // c.slli      -->  slli rd, rd, shamt[5:0]
+         16'b000__?_?????_?????_10 : d = {F7_BASE, shiftImm, rwh, 3'b001, rwh, OP_ALUIMM};
+         // c.lwsp      -->  lw   rd, offset[7:2](x2)
+         16'b010__?_?????_?????_10 : d = {lwspImm, x2, 3'b010, rwh, OP_LOAD};
+         // c.jr        -->  jalr x0, rs1, 0
+         16'b100__0_?????_00000_10 : d = {12'b000000000000, rwh, 3'b000, x0, OP_JALR};
+         // c.mv        -->  add  rd, x0, rs2
+         16'b100__0_?????_?????_10 : d = {F7_BASE, rwl, x0, 3'b000, rwh, OP_ALUREG};
+         // c.ebreak (16'b100__1_00000_00000_10) has no entry of its own
+         // c.jalr      -->  jalr x1, rs1, 0
+         16'b100__1_?????_00000_10 : d = {12'b000000000000, rwh, 3'b000, x1, OP_JALR};
+         // c.add       -->  add  rd, rd, rs2
+         16'b100__1_?????_?????_10 : d = {F7_BASE, rwl, rwh, 3'b000, rwh, OP_ALUREG};
+         // c.swsp      -->  sw   rs2, offset[7:2](x2)
+         16'b110__?_?????_?????_10 : d = {swspImm[11:5], rwl, x2, 3'b010, swspImm[4:0], OP_STORE};
+
+         // Four compressed RV32F load/store instructions
+         // c.flw       -->  flw   rd', offset[6:2](rs1')
+         16'b011_???_???_??_???_00 : d = {lwswImm, rch, 3'b010, rcl, OP_FLW};
+         // c.fsw       -->  fsw   rs2', offset[6:2](rs1')
+         16'b111_???_???_??_???_00 : d = {lwswImm[11:5], rcl, rch, 3'b010, lwswImm[4:0], OP_FSW};
+         // c.flwsp     -->  flw   rd, offset[7:2](x2)
+         16'b011__?_?????_?????_10 : d = {lwspImm, x2, 3'b010, rwh, OP_FLW};
+         // c.fswsp     -->  fsw   rs2, offset[7:2](x2)
+         16'b111__?_?????_?????_10 : d = {swspImm[11:5], rwl, x2, 3'b010, swspImm[4:0], OP_FSW};
+
+         // Anything else: don't care
+         default: d = {32{1'bx}};
+      endcase
+   end
+endmodule
+
+/*****************************************************************************/
@@ -0,0 +1,420 @@
+/*******************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+// This version: The "Quark", the most elementary version of FemtoRV32.
+//             A single VERILOG file, compact & understandable code.
+//             (200 lines of code, 400 lines counting comments)
+//
+// Instruction set: RV32I + RDCYCLES
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Macros:
+//    optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
+//              evaluate to 1 if addr is in mapped IO space,
+//              evaluate to 0 otherwise
+//    (additional wait states are used when in IO space).
+//    If left undefined, wait states are always used.
+//
+//    NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
+//    by the tick counter. If not defined, a 32-bit counter is generated.
+//    (reducing its width may be useful for space-constrained designs).
+//
+//    NRV_TWOLEVEL_SHIFTER may be defined to make shift operations faster
+//    (uses a two-level shifter inspired by picorv32).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/*******************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32i"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-Os"
+
+module FemtoRV32(
+   input 	 clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output [3:0]  mem_wmask, // write mask for the 4 bytes of each word
+   input [31:0]  mem_rdata, // input lines for both data and instr
+   output 	 mem_rstrb, // active to initiate memory read (used by IO)
+   input 	 mem_rbusy, // asserted if memory is busy reading value
+   input 	 mem_wbusy, // asserted if memory is busy writing value
+
+   input 	 reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+ /***************************************************************************/
+ // Instruction decoding.
+ /***************************************************************************/
+
+ // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
+ // Reference: Table page 104 of:
+ // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+ // Destination register index.
+ wire [4:0] rdId = instr[11:7];
+
+ // funct3 decoded to one-hot (reduces LUT count):
+ // funct3Is[val] <=> funct3 == val
+ (* onehot *)
+ wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+ // The five immediate formats, see RiscV reference (link above), Fig. 2.4 p. 12
+ wire [31:0] Uimm = {instr[31:12], 12'b0};
+ wire [31:0] Iimm = {{20{instr[31]}}, instr[31:20]};
+ /* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
+ wire [31:0] Simm = {{20{instr[31]}}, instr[31:25], instr[11:7]};
+ wire [31:0] Bimm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0};
+ wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12], instr[20], instr[30:21], 1'b0};
+ /* verilator lint_on UNUSED */
+
+   // Instruction classes, decoded from opcode bits instr[6:2].
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   wire isLoad    =  (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+   wire isStore   =  (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- cycles
+   // JAL is recognized by instr[3] alone instead of instr[6:2] == 5'b11011.
+   wire isJAL     =  instr[3];                 // rd <- PC+4; PC<-PC+Jimm
+
+   wire isALU = isALUreg | isALUimm;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+   
+   (* no_rw_check *)
+   reg [31:0] registerFile [31:0];
+
+   // Register write-back. Register 0 is never written.
+   always @(posedge clk) begin
+      if (writeBack && (rdId != 0)) begin
+         registerFile[rdId] <= writeBackData;
+      end
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except shifts.
+   /***************************************************************************/
+
+   // ALU operands. The first one is always rs1; the second one is rs2 for
+   // ALUreg and Branch, Iimm otherwise (ALUimm, Load, JALR).
+   wire [31:0] aluIn1 = rs1;
+   wire [31:0] aluIn2 = (isALUreg | isBranch) ? rs2 : Iimm;
+
+   // Shifter state: shifts are done iteratively in aluReg while aluShamt
+   // counts the remaining shift amount.
+   reg  [31:0] aluReg;
+   reg  [4:0]  aluShamt;
+
+   wire aluBusy = |aluShamt; // busy as long as shift amount is non-zero
+   wire aluWr;               // write strobe, starts shifting
+
+   // Adder, shared by arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // A single 33-bit subtraction (aluIn1 - aluIn2) provides SUB and all
+   // comparisons (trick borrowed from swapforth/J1).
+   wire [32:0] aluMinus = {1'b0, aluIn1} + {1'b1, ~aluIn2} + 33'b1;
+   wire        EQ  = (aluMinus[31:0] == 0);
+   wire        LTU = aluMinus[32];
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+
+   // funct3 = 001 (SLL) or 101 (SRL, SRA)
+   wire funct3IsShift = funct3Is[5] | funct3Is[1];
+
+   // Result mux. Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD; instr[5] must be tested as
+   //   well because it is 0 for ADDI, whose Iimm overlaps bit 30.
+   // - shifts deliver the content of aluReg.
+   wire [31:0] aluOut =
+     (funct3IsShift ? aluReg          : 32'b0) |
+     (funct3Is[7]   ? aluIn1 & aluIn2 : 32'b0) |
+     (funct3Is[6]   ? aluIn1 | aluIn2 : 32'b0) |
+     (funct3Is[4]   ? aluIn1 ^ aluIn2 : 32'b0) |
+     (funct3Is[3]   ? {31'b0, LTU}    : 32'b0) |
+     (funct3Is[2]   ? {31'b0, LT}     : 32'b0) |
+     (funct3Is[0]   ? ((instr[30] & instr[5]) ? aluMinus[31:0] : aluPlus)
+                    : 32'b0) ;
+
+   // Iterative shifter. On aluWr with a shift instruction, aluReg is loaded
+   // with the value to shift and aluShamt with the shift amount; afterwards
+   // aluReg is shifted while aluShamt counts down to zero (aluBusy is the
+   // OR of the aluShamt bits).
+   always @(posedge clk) begin
+      if(aluWr) begin
+         if (funct3IsShift) begin  // SLL, SRA, SRL
+	    aluReg <= aluIn1;
+	    aluShamt <= aluIn2[4:0];
+	 end
+      end
+
+      // With NRV_TWOLEVEL_SHIFTER the steps below are else-branches of the
+      // load above: 4 bits per clock while aluShamt >= 4, then 1 bit per
+      // clock. Without it, the 1-bit step is a separate `if` that is
+      // evaluated independently of aluWr.
+`ifdef NRV_TWOLEVEL_SHIFTER
+      else if(|aluShamt[4:2]) begin // Shift by 4
+         aluShamt <= aluShamt - 4;
+	 aluReg <= funct3Is[1] ? aluReg << 4 :
+		   {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};
+      end  else
+`endif
+      // Compact form of:
+      // funct3=001              -> SLL  (aluReg <= aluReg << 1)
+      // funct3=101 &  instr[30] -> SRA  (aluReg <= {aluReg[31], aluReg[31:1]})
+      // funct3=101 & !instr[30] -> SRL  (aluReg <= {1'b0,       aluReg[31:1]})
+
+      if (|aluShamt) begin
+         aluShamt <= aluShamt - 1;
+	 aluReg <= funct3Is[1] ? aluReg << 1 :              // SLL
+		   {instr[30] & aluReg[31], aluReg[31:1]};  // SRA,SRL
+      end
+   end
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   // Branch condition, selected by the 1-hot funct3 decode. Each line pairs
+   // a comparison with its negation. funct3 values 2 and 3 have no term,
+   // so they yield 0.
+   wire predicate =
+        (funct3Is[0] & EQ ) | (funct3Is[1] & ~EQ ) |  // BEQ,  BNE
+        (funct3Is[4] & LT ) | (funct3Is[5] & ~LT ) |  // BLT,  BGE
+        (funct3Is[6] & LTU) | (funct3Is[7] & ~LTU) ;  // BLTU, BGEU
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   // Program counter, ADDR_WIDTH bits wide.
+   reg  [ADDR_WIDTH-1:0] PC;
+
+   // Latched instruction. Only bits 31..2 are kept: bits 1..0 are ignored
+   // (not used in the RV32I base instruction set).
+   reg  [31:2] instr;
+
+   // Address of the next sequential instruction.
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // PC-relative adder shared by branches (PC+Bimm), JAL (PC+Jimm) and
+   // AUIPC (PC+Uimm). The immediate is picked from two instruction bits:
+   // instr[3] set -> Jimm, else instr[4] set -> Uimm, else Bimm. This is
+   // equivalent to (isJAL ? Jimm : isAUIPC ? Uimm : Bimm).
+   wire [ADDR_WIDTH-1:0] PCplusImm =
+      PC + (instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                       Bimm[ADDR_WIDTH-1:0]);
+
+   // Separate adder for the load/store address: rs1 + Simm for stores,
+   // rs1 + Iimm for loads (in this context, testing instr[5] is equivalent
+   // to testing isStore).
+   wire [ADDR_WIDTH-1:0] loadstore_addr =
+      rs1[ADDR_WIDTH-1:0] +
+      (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   /* verilator lint_off WIDTH */
+   // Internal address registers and the cycle counter may have fewer than
+   // 32 bits, so the width check is disabled for mem_addr and writeBackData.
+
+   // Address bus: PC while fetching or waiting for an instruction,
+   // load/store address in the other states.
+   assign mem_addr =
+      (state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit]) ? PC : loadstore_addr;
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   // Value written to the destination register: each source is masked by
+   // its instruction-class flag and the masked sources are OR-ed together.
+   wire [31:0] writeBackData  =
+      ({32{isLoad}}          & LOAD_data) |  // Load
+      ({32{isJAL | isJALR}}  & PCplus4  ) |  // JAL, JALR: return address
+      ({32{isAUIPC}}         & PCplusImm) |  // AUIPC
+      ({32{isALU}}           & aluOut   ) |  // ALUreg, ALUimm
+      ({32{isLUI}}           & Uimm     ) |  // LUI
+      ({32{isSYSTEM}}        & cycles   ) ;  // SYSTEM: cycle counter
+      
+   /* verilator lint_on WIDTH */
+
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   // Access size from funct3[1:0] (instr[13:12]):
+   // 00 -> byte, 01 -> halfword, any other value -> word.
+   wire mem_byteAccess     = (instr[13:12] == 2'b00);
+   wire mem_halfwordAccess = (instr[13:12] == 2'b01);
+
+   // Address bit 1 selects the halfword inside the word read from memory;
+   // address bit 0 then selects the byte inside that halfword.
+   wire [15:0] LOAD_halfword =
+      loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+      loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // Extension bit for byte/halfword loads. funct3[2] (instr[14]) = 0
+   // requests sign extension; with instr[14] = 1 the bit is forced to 0.
+   wire LOAD_sign =
+      !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   // Loaded value, extended to 32 bits according to the access size.
+   wire [31:0] LOAD_data =
+      mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
+      mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                           mem_rdata;
+
+   // STORE
+
+   // Store data: the low byte / low halfword of rs2 is steered onto the
+   // byte lane(s) selected by loadstore_addr[1:0]. STORE_wmask (below)
+   // determines which lanes are actually written.
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
+                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+
+   // Byte-lane write mask:
+   //    word     : 1111
+   //    halfword : 0011 or 1100              (chosen by loadstore_addr[1])
+   //    byte     : 0001, 0010, 0100 or 1000  (chosen by loadstore_addr[1:0])
+   wire [3:0] STORE_wmask =
+      mem_byteAccess ?
+         (loadstore_addr[0] ? (loadstore_addr[1] ? 4'b1000 : 4'b0010)
+                            : (loadstore_addr[1] ? 4'b0100 : 4'b0001)) :
+      mem_halfwordAccess ?
+         (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+         4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   // Bit position of each state inside the 1-hot state register.
+   localparam FETCH_INSTR_bit     = 0,
+              WAIT_INSTR_bit      = 1,
+              EXECUTE_bit         = 2,
+              WAIT_ALU_OR_MEM_bit = 3,
+              NB_STATES           = 4;
+
+   // The corresponding 1-hot state values.
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit,
+              WAIT_INSTR      = 1 << WAIT_INSTR_bit,
+              EXECUTE         = 1 << EXECUTE_bit,
+              WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // Control signals (internal and external) derived combinatorially from
+   // the state and the decoded instruction.
+
+   // Register write-back enable: active in EXECUTE and WAIT_ALU_OR_MEM for
+   // every instruction except branches and stores.
+   wire writeBack = (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]) &
+                    ~isBranch & ~isStore;
+
+   // Memory read strobe: instruction fetch, or a load being executed.
+   assign mem_rstrb = state[FETCH_INSTR_bit] | (state[EXECUTE_bit] & isLoad);
+
+   // Memory write mask: non-zero only while a store is being executed.
+   assign mem_wmask = (state[EXECUTE_bit] & isStore) ? STORE_wmask : 4'b0000;
+
+   // aluWr starts computation (shifts) in the ALU.
+   assign aluWr = isALU & state[EXECUTE_bit];
+
+   // JAL, or a conditional branch whose condition holds.
+   wire jumpToPCplusImm = (isBranch & predicate) | isJAL;
+
+   // needToWait: EXECUTE must be followed by WAIT_ALU_OR_MEM. If
+   // NRV_IS_IO_ADDR is defined, stores wait only when the macro evaluates
+   // to 1 for mem_addr; otherwise every store waits.
+`ifdef NRV_IS_IO_ADDR
+   wire needToWait = isLoad |
+                     isStore  & `NRV_IS_IO_ADDR(mem_addr) |
+                     isALU & funct3IsShift;
+`else
+   wire needToWait = isLoad | isStore | isALU & funct3IsShift;
+`endif
+
+   // Main sequencer. Reset is synchronous and active low. The states are
+   // tested with a "reverse case" on the 1-hot state bits; FETCH_INSTR is
+   // handled by the default branch.
+   always @(posedge clk) begin
+      if(!reset) begin
+         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        // Instruction available: latch it and read both source registers,
+        // indexed directly by the rs1/rs2 fields of the incoming word.
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              rs1 <= registerFile[mem_rdata[19:15]];
+              rs2 <= registerFile[mem_rdata[24:20]];
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE;         // also the declaration of instr).
+           end
+        end
+
+        // Update PC (the JALR target has its bit 0 forced to 0), then
+        // either wait for the ALU / memory or fetch the next instruction.
+        state[EXECUTE_bit]: begin
+           PC <= isJALR          ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+                 jumpToPCplusImm ? PCplusImm :
+                 PCplus4;
+	   state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+        end
+
+        // Stay here until the ALU and both memory directions are idle.
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+        end
+
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+
+      endcase
+   end
+
+   /***************************************************************************/
+   // Cycle counter
+   /***************************************************************************/
+
+   // Free-running counter, incremented on every clock. Its width is
+   // NRV_COUNTER_WIDTH when that macro is defined, 32 bits otherwise.
+`ifdef NRV_COUNTER_WIDTH
+   reg [`NRV_COUNTER_WIDTH-1:0]  cycles;
+`else
+   reg [31:0]  cycles;
+`endif
+   always @(posedge clk) begin
+      cycles <= cycles + 1;
+   end
+
+   // Only when BENCH is defined: give the counter, the shift amount and
+   // register 0 a known initial value.
+`ifdef BENCH
+   initial begin
+      registerFile[0] = 0;
+      aluShamt = 0;
+      cycles = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements.
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed.
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this:
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a state machine that is both more compact (fewer LUTs) and faster.
+
@@ -0,0 +1,409 @@
+/*******************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+// This version: The "Quark Bicycle": the elementary "Quark" core, but with
+//             a barrel shifter (shifts complete within the EXECUTE cycle).
+//             A single VERILOG file, compact & understandable code.
+//             (200 lines of code, 400 lines counting comments)
+//
+// Instruction set: RV32I + RDCYCLES
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Macros:
+//    optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
+//              evaluate to 1 if addr is in mapped IO space,
+//              evaluate to 0 otherwise
+//    (additional wait states are used when in IO space).
+//    If left undefined, wait states are always used.
+//
+//    NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
+//    by the ticks counter. If not defined, a 32-bits counter is generated.
+//    (reducing its width may be useful for space-constrained designs).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/*******************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32i"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-Os"
+
+// Processor core. All registers of the module are updated on the rising
+// edge of clk; the memory interface is a single bus shared by instruction
+// fetches and data accesses.
+module FemtoRV32(
+   input         clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   // Value loaded into PC on reset (only its low ADDR_WIDTH bits are used).
+   parameter RESET_ADDR       = 32'h00000000;
+   // Width of PC and of the internal address adders.
+   parameter ADDR_WIDTH       = 24;
+
+ /***************************************************************************/
+ // Instruction decoding.
+ /***************************************************************************/
+
+ // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction.
+ // Reference: Table page 104 of:
+ // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+ // Index of the destination register (rd field).
+ wire [4:0] rdId = instr[11:7];
+
+ // funct3 (instr[14:12]) decoded in 1-hot form, which reduces LUT count:
+ // funct3Is[val] is set <=> funct3 == val.
+ (* onehot *)
+ wire [7:0] funct3Is = 8'd1 << instr[14:12];
+
+ // The five immediate formats (U, I, S, B, J), see RiscV reference
+ // (link above), Fig. 2.4 p. 12. All except Uimm are sign-extended from
+ // instr[31]; Uimm places instr[31:12] in the upper 20 bits.
+ wire [31:0] Uimm = {instr[31:12], 12'b0};
+ wire [31:0] Iimm = {{20{instr[31]}}, instr[31:20]};
+ /* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder.
+ wire [31:0] Simm = {{20{instr[31]}}, instr[31:25], instr[11:7]};
+ wire [31:0] Bimm = {{19{instr[31]}}, instr[31], instr[7],
+                     instr[30:25], instr[11:8], 1'b0};
+ wire [31:0] Jimm = {{11{instr[31]}}, instr[31], instr[19:12],
+                     instr[20], instr[30:21], 1'b0};
+ /* verilator lint_on UNUSED */
+
+   // Opcode field used for decoding. Bits 1..0 of the instruction are not
+   // stored (see the declaration of instr).
+   wire [4:0] opcode = instr[6:2];
+
+   // The ten instruction classes decoded by this core.
+   wire isLoad    = (opcode == 5'b00000); // rd <- mem[rs1+Iimm]
+   wire isALUimm  = (opcode == 5'b00100); // rd <- rs1 OP Iimm
+   wire isAUIPC   = (opcode == 5'b00101); // rd <- PC + Uimm
+   wire isStore   = (opcode == 5'b01000); // mem[rs1+Simm] <- rs2
+   wire isALUreg  = (opcode == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     = (opcode == 5'b01101); // rd <- Uimm
+   wire isBranch  = (opcode == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    = (opcode == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     = (opcode == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  = (opcode == 5'b11100); // rd <- cycles
+
+   // Any ALU operation, with a register or an immediate second operand.
+   wire isALU = isALUreg | isALUimm;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   // Source operands, loaded from the register file when an instruction
+   // is latched.
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+
+   // 32 registers of 32 bits.
+   reg [31:0] registerFile [31:0];
+
+   // Write port: register 0 is never written.
+   always @(posedge clk) begin
+      if (writeBack && (rdId != 5'd0)) begin
+         registerFile[rdId] <= writeBackData;
+      end
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except shifts.
+   /***************************************************************************/
+
+   // First ALU operand: always rs1.
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU operand: rs2 for ALUreg and Branch, Iimm for everything
+   // else (ALUimm, Load, JALR).
+   wire [31:0] aluIn2 = (isBranch | isALUreg) ? rs2 : Iimm;
+
+   // Shared 32-bit adder: its sum serves the arithmetic instructions and
+   // the JALR target computation.
+   wire [31:0] aluPlus = aluIn2 + aluIn1;
+
+   // A single 33-bit subtraction (aluIn1 + ~aluIn2 + 1, with one extra bit
+   // on top) yields the difference and feeds all three comparisons
+   // (trick borrowed from swapforth/J1).
+   wire [32:0] aluMinus = {1'b0, aluIn1} + {1'b1, ~aluIn2} + 33'd1;
+
+   // Equality: the low 32 bits of the difference are all zero.
+   wire        EQ  = (aluMinus[31:0] == 32'b0);
+   // Unsigned less-than: bit 32 of the 33-bit difference.
+   wire        LTU = aluMinus[32];
+   // Signed less-than: same sign -> use bit 32, different signs -> the
+   // sign of aluIn1 decides.
+   wire        LT  = (aluIn1[31] == aluIn2[31]) ? aluMinus[32] : aluIn1[31];
+
+   /***************************************************************************/
+
+   // A single right shifter serves both directions: for a left shift
+   // (funct3 = 001) the operand is bit-reversed on the way in and the
+   // shifted result is bit-reversed again on the way out.
+
+   // Returns x with its bit order reversed (bit i of the result is
+   // bit 31-i of x).
+   function [31:0] flip32;
+      input [31:0] x;
+      integer i;
+      begin
+         for (i = 0; i < 32; i = i + 1)
+            flip32[i] = x[31 - i];
+      end
+   endfunction
+
+   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
+
+   // 33-bit arithmetic right shift by aluIn2[4:0]. The extra top bit is
+   // instr[30] & aluIn1[31], so the sign of aluIn1 is shifted in only
+   // when instr[30] is set (SRA); otherwise zeros are shifted in.
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter =
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   // Result of a left shift: the shifter output, bit-reversed back.
+   wire [31:0] leftshift = flip32(shifter);
+
+   /***************************************************************************/
+
+   // Decoding notes:
+   // - instr[30] selects SUB (1) or ADD (0). It is only meaningful together
+   //   with instr[5] (1 for ADD/SUB, 0 for ADDI), because in ADDI bit 30 is
+   //   part of the immediate Iimm.
+   // - instr[30] also selects SRA (1, sign extension) or SRL (0).
+
+   // ALU result: every candidate value is masked by its 1-hot funct3 select
+   // and the masked values are OR-ed together.
+   wire [31:0] aluOut =
+     ({32{funct3Is[0]}} & (instr[30] & instr[5] ? aluMinus[31:0] : aluPlus)) |
+     ({32{funct3Is[1]}} & leftshift)                                         |
+     {31'b0, funct3Is[2] & LT}                                               |
+     {31'b0, funct3Is[3] & LTU}                                              |
+     ({32{funct3Is[4]}} & (aluIn1 ^ aluIn2))                                 |
+     ({32{funct3Is[5]}} & shifter)                                           |
+     ({32{funct3Is[6]}} & (aluIn1 | aluIn2))                                 |
+     ({32{funct3Is[7]}} & (aluIn1 & aluIn2))                                 ;
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   // Branch condition, selected by the 1-hot funct3 decode. Each line pairs
+   // a comparison with its negation. funct3 values 2 and 3 have no term,
+   // so they yield 0.
+   wire predicate =
+        (funct3Is[0] & EQ ) | (funct3Is[1] & ~EQ ) |  // BEQ,  BNE
+        (funct3Is[4] & LT ) | (funct3Is[5] & ~LT ) |  // BLT,  BGE
+        (funct3Is[6] & LTU) | (funct3Is[7] & ~LTU) ;  // BLTU, BGEU
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   // Program counter, ADDR_WIDTH bits wide.
+   reg  [ADDR_WIDTH-1:0] PC;
+
+   // Latched instruction. Only bits 31..2 are kept: bits 1..0 are ignored
+   // (not used in the RV32I base instruction set).
+   reg  [31:2] instr;
+
+   // Address of the next sequential instruction.
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // PC-relative adder shared by branches (PC+Bimm), JAL (PC+Jimm) and
+   // AUIPC (PC+Uimm). The immediate is picked from two instruction bits:
+   // instr[3] set -> Jimm, else instr[4] set -> Uimm, else Bimm. This is
+   // equivalent to (isJAL ? Jimm : isAUIPC ? Uimm : Bimm).
+   wire [ADDR_WIDTH-1:0] PCplusImm =
+      PC + (instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                       Bimm[ADDR_WIDTH-1:0]);
+
+   // Separate adder for the load/store address: rs1 + Simm for stores,
+   // rs1 + Iimm for loads (in this context, testing instr[5] is equivalent
+   // to testing isStore).
+   wire [ADDR_WIDTH-1:0] loadstore_addr =
+      rs1[ADDR_WIDTH-1:0] +
+      (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   /* verilator lint_off WIDTH */
+   // Internal address registers and the cycle counter may have fewer than
+   // 32 bits, so the width check is disabled for mem_addr and writeBackData.
+
+   // Next value of PC: JALR target (bit 0 forced to 0), taken jump or
+   // branch target, or the next sequential address.
+   wire [ADDR_WIDTH-1:0] PC_new =
+      isJALR          ? {aluPlus[ADDR_WIDTH-1:1], 1'b0} :
+      jumpToPCplusImm ? PCplusImm :
+                        PCplus4;
+
+   // Address bus:
+   // - PC while fetching or waiting for an instruction;
+   // - PC_new during EXECUTE of anything but a load or a store, so that
+   //   the next instruction is already addressed in that cycle;
+   // - the load/store address otherwise.
+   assign mem_addr =
+      (state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit]) ? PC     :
+      (state[EXECUTE_bit] & ~(isLoad | isStore))       ? PC_new :
+                                                         loadstore_addr;
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   // Value written to the destination register: each source is masked by
+   // its instruction-class flag and the masked sources are OR-ed together.
+   wire [31:0] writeBackData  =
+      ({32{isLoad}}          & LOAD_data) |  // Load
+      ({32{isJAL | isJALR}}  & PCplus4  ) |  // JAL, JALR: return address
+      ({32{isAUIPC}}         & PCplusImm) |  // AUIPC
+      ({32{isALU}}           & aluOut   ) |  // ALUreg, ALUimm
+      ({32{isLUI}}           & Uimm     ) |  // LUI
+      ({32{isSYSTEM}}        & cycles   ) ;  // SYSTEM: cycle counter
+
+   /* verilator lint_on WIDTH */
+
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   // Access size from funct3[1:0] (instr[13:12]):
+   // 00 -> byte, 01 -> halfword, any other value -> word.
+   wire mem_byteAccess     = (instr[13:12] == 2'b00);
+   wire mem_halfwordAccess = (instr[13:12] == 2'b01);
+
+   // Address bit 1 selects the halfword inside the word read from memory;
+   // address bit 0 then selects the byte inside that halfword.
+   wire [15:0] LOAD_halfword =
+      loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+      loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // Extension bit for byte/halfword loads. funct3[2] (instr[14]) = 0
+   // requests sign extension; with instr[14] = 1 the bit is forced to 0.
+   wire LOAD_sign =
+      !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   // Loaded value, extended to 32 bits according to the access size.
+   wire [31:0] LOAD_data =
+      mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
+      mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                           mem_rdata;
+
+   // STORE
+
+   // Store data, written as one concatenation of the four byte lanes
+   // (most significant lane first). The low byte / low halfword of rs2 is
+   // steered onto the lane(s) selected by loadstore_addr[1:0]; STORE_wmask
+   // (below) determines which lanes are actually written.
+   assign mem_wdata = {
+      (loadstore_addr[0] ? rs2[7:0]  :
+       loadstore_addr[1] ? rs2[15:8] : rs2[31:24]),   // bits 31..24
+      (loadstore_addr[1] ? rs2[7:0]  : rs2[23:16]),   // bits 23..16
+      (loadstore_addr[0] ? rs2[7:0]  : rs2[15:8]),    // bits 15..8
+      rs2[7:0]                                        // bits  7..0
+   };
+
+   // Byte-lane write mask:
+   //    word     : 1111
+   //    halfword : 0011 or 1100              (chosen by loadstore_addr[1])
+   //    byte     : 0001, 0010, 0100 or 1000  (chosen by loadstore_addr[1:0])
+   wire [3:0] STORE_wmask =
+      mem_byteAccess ?
+         (loadstore_addr[0] ? (loadstore_addr[1] ? 4'b1000 : 4'b0010)
+                            : (loadstore_addr[1] ? 4'b0100 : 4'b0001)) :
+      mem_halfwordAccess ?
+         (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+         4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   // Bit position of each state inside the 1-hot state register.
+   localparam FETCH_INSTR_bit     = 0,
+              WAIT_INSTR_bit      = 1,
+              EXECUTE_bit         = 2,
+              WAIT_ALU_OR_MEM_bit = 3,
+              NB_STATES           = 4;
+
+   // The corresponding 1-hot state values.
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit,
+              WAIT_INSTR      = 1 << WAIT_INSTR_bit,
+              EXECUTE         = 1 << EXECUTE_bit,
+              WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // Control signals (internal and external) derived combinatorially from
+   // the state and the decoded instruction.
+
+   // Register write-back enable: active in EXECUTE and WAIT_ALU_OR_MEM for
+   // every instruction except branches and stores.
+   wire writeBack = (state[EXECUTE_bit] | state[WAIT_ALU_OR_MEM_bit]) &
+                    ~isBranch & ~isStore;
+
+   // Memory read strobe: instruction fetch, or EXECUTE of any instruction
+   // that is not a store.
+   assign mem_rstrb = state[FETCH_INSTR_bit] | (state[EXECUTE_bit] & ~isStore);
+
+   // Memory write mask: non-zero only while a store is being executed.
+   assign mem_wmask = (state[EXECUTE_bit] & isStore) ? STORE_wmask : 4'b0000;
+
+   // JAL, or a conditional branch whose condition holds.
+   wire jumpToPCplusImm = (isBranch & predicate) | isJAL;
+
+   // needToWait: EXECUTE must be followed by WAIT_ALU_OR_MEM. If
+   // NRV_IS_IO_ADDR is defined, stores wait only when the macro evaluates
+   // to 1 for mem_addr; otherwise every store waits.
+`ifdef NRV_IS_IO_ADDR
+   wire needToWait = isLoad |
+                     isStore  & `NRV_IS_IO_ADDR(mem_addr) ;
+`else
+   wire needToWait = isLoad | isStore ;
+`endif
+
+   // Main sequencer. Reset is synchronous and active low. The states are
+   // tested with a "reverse case" on the 1-hot state bits; FETCH_INSTR is
+   // handled by the default branch.
+   always @(posedge clk) begin
+      if(!reset) begin
+         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        // Instruction available: latch it and read both source registers,
+        // indexed directly by the rs1/rs2 fields of the incoming word.
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              rs1 <= registerFile[mem_rdata[19:15]];
+              rs2 <= registerFile[mem_rdata[24:20]];
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE;         // also the declaration of instr).
+           end
+        end
+
+        state[EXECUTE_bit]: begin
+           PC <= PC_new;
+           // The fetch of the next instruction is overlapped with EXECUTE
+           // (mem_addr = PC_new, mem_rstrb = 1) only for instructions that
+           // are neither loads nor stores, so only those may skip
+           // FETCH_INSTR. A store that does not need to wait (possible
+           // when NRV_IS_IO_ADDR is defined) drove the bus with its own
+           // address during EXECUTE and issued no read strobe: the fetch
+           // must still be started in FETCH_INSTR, otherwise WAIT_INSTR
+           // would latch a word that was never requested at PC_new.
+           state <= needToWait ? WAIT_ALU_OR_MEM :
+                    isStore    ? FETCH_INSTR     :
+                                 WAIT_INSTR;
+        end
+
+        // Stay here until both memory directions are idle.
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+        end
+
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+
+      endcase
+   end
+
+   /***************************************************************************/
+   // Cycle counter
+   /***************************************************************************/
+
+   // Free-running counter, incremented on every clock. Its width is
+   // NRV_COUNTER_WIDTH when that macro is defined, 32 bits otherwise.
+`ifdef NRV_COUNTER_WIDTH
+   reg [`NRV_COUNTER_WIDTH-1:0]  cycles;
+`else
+   reg [31:0]  cycles;
+`endif
+   always @(posedge clk) begin
+      cycles <= cycles + 1;
+   end
+
+   // Only when BENCH is defined: give the counter and register 0 a known
+   // initial value.
+`ifdef BENCH
+   initial begin
+      registerFile[0] = 0;
+      cycles = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements.
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed.
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this:
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a state machine that is both more compact (fewer LUTs) and faster.
+
@@ -0,0 +1,421 @@
+/*******************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+// This version: The "Tachyon". It works like the "Quark", with the
+//  difference that EXECUTE is split into two steps. This allows
+//  higher maxfreq.
+//
+// Instruction set: RV32I + RDCYCLES
+//
+// Parameters:
+//  Reset address can be defined using RESET_ADDR (default is 0).
+//
+//  The ADDR_WIDTH parameter lets you define the width of the internal
+//  address bus (and address computation logic).
+//
+// Macros:
+//    optionally one may define NRV_IS_IO_ADDR(addr), that is supposed to:
+//              evaluate to 1 if addr is in mapped IO space, 
+//              evaluate to 0 otherwise
+//    (additional wait states are used when in IO space).
+//    If left undefined, wait states are always used.
+//
+//    NRV_COUNTER_WIDTH may be defined to reduce the number of bits used
+//    by the cycle counter. If not defined, a 32-bit counter is generated
+//    (reducing its width may be useful for space-constrained designs).
+//
+//    NRV_TWOLEVEL_SHIFTER may be defined to make shift operations faster
+//    (uses a two-level shifter inspired by picorv32).
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/*******************************************************************/
+
+// Firmware generation flags for this processor
+`define NRV_ARCH     "rv32i"
+`define NRV_ABI      "ilp32"
+`define NRV_OPTIMIZE "-Os"
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output [3:0]  mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   parameter RESET_ADDR       = 32'h00000000; 
+   parameter ADDR_WIDTH       = 24;           
+
+ /***************************************************************************/
+ // Instruction decoding.
+ /***************************************************************************/
+
+ // Extracts rd,rs1,rs2,funct3,imm and opcode from instruction. 
+ // Reference: Table page 104 of:
+ // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+ // Destination register index: instruction bits 11..7.
+ wire [4:0] rdId;
+ assign rdId = instr[11:7];
+
+ // funct3 (instruction bits 14..12) expanded to 1-hot form, which
+ // reduces LUT count: funct3Is[val] is 1 exactly when funct3 == val.
+ (* onehot *)
+ wire [7:0] funct3Is;
+ assign funct3Is = 8'h01 << instr[14:12];
+
+ // The five immediate formats (U, I, S, B, J), each extended to 32 bits.
+ // See the RiscV reference (link above), Fig. 2.4 p. 12.
+ wire [31:0] Uimm = {instr[31:12], 12'b0};
+ wire [31:0] Iimm = {{20{instr[31]}}, instr[31:20]};
+ /* verilator lint_off UNUSED */ // MSBs of SBJimms are not used by addr adder. 
+ wire [31:0] Simm = {{20{instr[31]}}, instr[31:25], instr[11:7]};
+ wire [31:0] Bimm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0};
+ wire [31:0] Jimm = {{12{instr[31]}}, instr[19:12], instr[20], instr[30:21], 1'b0};
+ /* verilator lint_on UNUSED */
+
+   // The 10 instruction classes of base RV32I, recognized from opcode
+   // bits instr[6:2] (bits 1:0 are not stored in instr).
+   wire isLoad, isALUimm, isAUIPC, isStore, isALUreg;
+   wire isLUI, isBranch, isJALR, isJAL, isSYSTEM;
+
+   assign isLoad   = (instr[6:2] == 5'b00000); // rd <- mem[rs1+Iimm]
+   assign isALUimm = (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm
+   assign isAUIPC  = (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   assign isStore  = (instr[6:2] == 5'b01000); // mem[rs1+Simm] <- rs2
+   assign isALUreg = (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   assign isLUI    = (instr[6:2] == 5'b01101); // rd <- Uimm
+   assign isBranch = (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   assign isJALR   = (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   assign isJAL    = (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   assign isSYSTEM = (instr[6:2] == 5'b11100); // rd <- cycles
+
+   // Any ALU instruction, register-register or register-immediate.
+   wire isALU = isALUreg | isALUimm;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+   
+   reg [31:0] registerFile [31:0]; // 32 registers of 32 bits each
+   reg [31:0] rs1;                 // first source operand
+   reg [31:0] rs2;                 // second source operand
+
+   // Write port: stores writeBackData into register rdId when writeBack
+   // is asserted. Register 0 is never written.
+   always @(posedge clk) begin
+      if (writeBack && (rdId != 5'd0)) begin
+         registerFile[rdId] <= writeBackData;
+      end
+   end
+
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except shifts.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   reg  [31:0] aluReg;       // The internal register of the ALU, used by shift.
+   reg  [4:0]  aluShamt;     // Current shift amount.
+
+   wire aluBusy = |aluShamt; // ALU is busy if shift amount is non-zero.
+   wire aluWr;               // ALU write strobe, starts shifting.
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   // {1'b1,~aluIn2} + {1'b0,aluIn1} + 1 equals aluIn1 - aluIn2 modulo 2^33:
+   // - bits 31:0 hold the 32-bit difference (zero iff the inputs are equal),
+   // - bit 32 is set iff aluIn1 < aluIn2 as unsigned numbers (LTU).
+   // For the signed comparison (LT): if the operands have different signs,
+   // the negative one (sign bit set) is the smaller; otherwise the unsigned
+   // result is also valid for the signed values.
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   wire        LTU = aluMinus[32];
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+   
+   // The ALU result is always read from the internal register.
+   wire [31:0] aluOut = aluReg;
+
+   // funct3 == 1 (shift left) or funct3 == 5 (shift right).
+   wire funct3IsShift = funct3Is[1] | funct3Is[5];
+
+   // ALU register update.
+   // - When aluWr is asserted, aluReg is loaded with the result of the
+   //   operation selected by funct3. For shifts, it is loaded with the
+   //   unshifted operand, and aluShamt with the shift amount.
+   // - While aluShamt is non-zero, aluReg is shifted by one position
+   //   (or by four with NRV_TWOLEVEL_SHIFTER) per clock and aluShamt is
+   //   decremented accordingly.
+   // Note: when NRV_TWOLEVEL_SHIFTER is not defined, the final
+   // "if (|aluShamt)" is NOT chained to "if(aluWr)" with an else: both
+   // are evaluated in the same clock. Its assignments would override the
+   // ones done under aluWr if aluShamt were non-zero at that time. The
+   // state machine of this module only leaves its wait state when aluBusy
+   // is low, so aluShamt is expected to be zero whenever aluWr is asserted.
+   always @(posedge clk) begin
+      if(aluWr) begin
+	 aluShamt <= funct3IsShift ? aluIn2[4:0] : 5'b0;
+	 // funct3Is is 1-hot, so exactly one term of this OR is non-zero.
+	 aluReg <=
+	 (funct3IsShift ? aluIn1 : 32'b0                                        ) |
+	 (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) | 
+	 (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) | 
+         (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) | 
+         (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) | 
+         (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) | 
+	 (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+      end 
+
+`ifdef NRV_TWOLEVEL_SHIFTER
+      // Taken while bit 2 or bit 3 of the remaining shift amount is set.
+      else if(|aluShamt[3:2]) begin // Shift by 4
+         aluShamt <= aluShamt - 4;
+	 aluReg <= funct3Is[1] ? aluReg << 4 : 
+		   {{4{instr[30] & aluReg[31]}}, aluReg[31:4]};	    
+      end  else
+`endif
+      // Compact form of:
+      // funct3=001              -> SLL  (aluReg <= aluReg << 1)      
+      // funct3=101 &  instr[30] -> SRA  (aluReg <= {aluReg[31], aluReg[31:1]})
+      // funct3=101 & !instr[30] -> SRL  (aluReg <= {1'b0,       aluReg[31:1]})
+
+      if (|aluShamt) begin
+         aluShamt <= aluShamt - 1;
+	 aluReg <= funct3Is[1] ? aluReg << 1 :              // SLL
+		   {instr[30] & aluReg[31], aluReg[31:1]};  // SRA,SRL
+      end
+   end
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+   /***************************************************************************/
+
+   // Registered copy of the branch decision (loaded by the state machine).
+   reg  predicate;
+
+   // Combinatorial branch decision: the comparison result selected by
+   // funct3 (funct3Is is 1-hot, so at most one term is active).
+   wire predicate_;
+   assign predicate_ = (funct3Is[0] &&  EQ ) ||  // BEQ
+                       (funct3Is[1] && !EQ ) ||  // BNE
+                       (funct3Is[4] &&  LT ) ||  // BLT
+                       (funct3Is[5] && !LT ) ||  // BGE
+                       (funct3Is[6] &&  LTU) ||  // BLTU
+                       (funct3Is[7] && !LTU);    // BGEU
+   
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   // Address of the next sequential instruction.
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+
+   // Branch address, JAL address and AUIPC value (PC + immediate).
+   // In this core it is a register: the sum is computed and stored by the
+   // state machine in the EXECUTE1 state and used in the following states.
+   reg [ADDR_WIDTH-1:0]  PCplusImm;
+
+   // Address of load/store (rs1 + immediate). Also a register, stored by
+   // the state machine in the EXECUTE1 state.
+   reg [ADDR_WIDTH-1:0]  loadstore_addr;
+   
+   /* verilator lint_off WIDTH */   
+   // internal address registers and cycles counter may have less than 
+   // 32 bits, so we deactivate width test for mem_addr and writeBackData
+   
+   // The address bus carries PC while an instruction is fetched
+   // (FETCH_INSTR and WAIT_INSTR states) and the load/store address
+   // in all the other states.
+   assign mem_addr = state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ? 
+		     PC : loadstore_addr ;
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   // Value written to the destination register. The instruction classes
+   // are decoded from distinct opcode values, so at most one of the gated
+   // terms below is non-zero and they can simply be OR-ed together.
+   wire [31:0] writeBackData;
+   assign writeBackData =
+      ({32{isSYSTEM}}       & cycles   ) |  // SYSTEM: cycle counter
+      ({32{isLUI}}          & Uimm     ) |  // LUI
+      ({32{isALU}}          & aluOut   ) |  // ALUreg, ALUimm
+      ({32{isAUIPC}}        & PCplusImm) |  // AUIPC
+      ({32{isJALR | isJAL}} & PCplus4  ) |  // JAL, JALR: return address
+      ({32{isLoad}}         & LOAD_data) ;  // Load
+
+   /* verilator lint_on WIDTH */	       	       	       
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // Memory is only accessed as aligned 32-bit words. Byte and halfword
+   // loads/stores are realized by the logic below, which depends on:
+   // - funct3[1:0] (instr[13:12]): 00->byte 01->halfword 10->word
+   // - loadstore_addr[1:0]: which byte/halfword of the word is accessed
+
+   wire mem_byteAccess     = ~instr[13] & ~instr[12]; // funct3[1:0] == 2'b00
+   wire mem_halfwordAccess = ~instr[13] &  instr[12]; // funct3[1:0] == 2'b01
+
+   // LOAD: pick the addressed halfword of the word read from memory,
+   // then the addressed byte of that halfword.
+   wire [15:0] LOAD_halfword =
+      loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire [7:0] LOAD_byte =
+      loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // Bit replicated into the upper bits of a byte/halfword load.
+   // funct3[2] (instr[14]): 0->sign extension, 1->zero extension.
+   wire LOAD_sign =
+      ~instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   wire [31:0] LOAD_data =
+      mem_byteAccess     ? {{24{LOAD_sign}}, LOAD_byte}     :
+      mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                           mem_rdata;
+
+   // STORE: the low byte/halfword of rs2 is copied to every byte lane
+   // where it may have to be written; the write mask selects the lanes
+   // that are actually stored.
+   assign mem_wdata = {
+      (loadstore_addr[0] ? rs2[7:0] :
+       loadstore_addr[1] ? rs2[15:8] : rs2[31:24]),   // lane 3, bits 31:24
+      (loadstore_addr[1] ? rs2[7:0] : rs2[23:16]),    // lane 2, bits 23:16
+      (loadstore_addr[0] ? rs2[7:0] : rs2[15:8]),     // lane 1, bits 15:8
+      rs2[7:0]                                        // lane 0, bits  7:0
+   };
+
+   // The memory write mask:
+   //    byte:     a single bit, at position loadstore_addr[1:0]
+   //              (0001, 0010, 0100 or 1000)
+   //    halfword: 0011 or 1100, depending on loadstore_addr[1]
+   //    word:     1111
+   wire [3:0] STORE_wmask =
+      mem_byteAccess     ? (4'b0001 << loadstore_addr[1:0])                   :
+      mem_halfwordAccess ? {{2{loadstore_addr[1]}}, {2{~loadstore_addr[1]}}}  :
+                           4'b1111;
+
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   // Position of each state in the 1-hot state vector.
+   localparam FETCH_INSTR_bit     = 0,
+              WAIT_INSTR_bit      = 1,
+              EXECUTE1_bit        = 2,
+              EXECUTE2_bit        = 3,
+              WAIT_ALU_OR_MEM_bit = 4;
+   localparam NB_STATES           = 5;
+
+   // 1-hot value of each state.
+   localparam FETCH_INSTR     = 1 << FETCH_INSTR_bit,
+              WAIT_INSTR      = 1 << WAIT_INSTR_bit,
+              EXECUTE1        = 1 << EXECUTE1_bit,
+              EXECUTE2        = 1 << EXECUTE2_bit,
+              WAIT_ALU_OR_MEM = 1 << WAIT_ALU_OR_MEM_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // Signals (internal and external) derived combinatorially from the
+   // current state and from the decoded instruction.
+
+   // Register write-back enable: active in EXECUTE2 and WAIT_ALU_OR_MEM
+   // for every instruction except branches and stores.
+   wire writeBack = !isBranch & !isStore &
+                    (state[EXECUTE2_bit] | state[WAIT_ALU_OR_MEM_bit]);
+
+   // Memory-read strobe: instruction fetch, or load in EXECUTE2.
+   assign mem_rstrb = state[FETCH_INSTR_bit] | (state[EXECUTE2_bit] & isLoad);
+
+   // Memory-write mask: non-zero only for a store in EXECUTE2.
+   assign mem_wmask = (state[EXECUTE2_bit] & isStore) ? STORE_wmask : 4'b0000;
+
+   // aluWr starts the computation (and shifts) in the ALU.
+   assign aluWr = isALU & state[EXECUTE1_bit];
+
+   // Asserted when the next PC is PCplusImm: JAL, or branch taken.
+   wire jumpToPCplusImm = (predicate & isBranch) | isJAL;
+
+   // Asserted when EXECUTE2 must be followed by WAIT_ALU_OR_MEM.
+`ifdef NRV_IS_IO_ADDR
+   wire needToWait = isLoad |
+                     isStore  & `NRV_IS_IO_ADDR(mem_addr) |
+                     aluBusy;
+`else
+   wire needToWait = isLoad | isStore | aluBusy;
+`endif
+   
+   // The state machine. Sequence for one instruction:
+   //   FETCH_INSTR -> WAIT_INSTR -> EXECUTE1 -> EXECUTE2
+   //               -> [WAIT_ALU_OR_MEM, only if needToWait] -> FETCH_INSTR
+   // reset is active low: while it is 0, PC is loaded with RESET_ADDR and
+   // the machine is put in WAIT_ALU_OR_MEM.
+   always @(posedge clk) begin
+      if(!reset) begin
+         state      <= WAIT_ALU_OR_MEM; // Just waiting for !mem_wbusy
+         PC         <= RESET_ADDR[ADDR_WIDTH-1:0];
+      end else
+
+      // See note [1] at the end of this file.
+      (* parallel_case *)
+      case(1'b1)
+
+        // Latch the fetched instruction and read both source registers,
+        // using the register indices directly from the memory data lines.
+        state[WAIT_INSTR_bit]: begin
+           if(!mem_rbusy) begin // may be high when executing from SPI flash
+              rs1 <= registerFile[mem_rdata[19:15]];
+              rs2 <= registerFile[mem_rdata[24:20]];
+              instr <= mem_rdata[31:2]; // Bits 0 and 1 are ignored (see
+              state <= EXECUTE1;        // also the declaration of instr).
+           end
+        end
+
+        // First execute step: register the results of the address adders
+        // and of the branch predicate (the ALU register is loaded in the
+        // same clock through aluWr).
+        state[EXECUTE1_bit]: begin
+	   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+	   // Equivalent to:
+	   //  PCplusImm <= PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+	   PCplusImm <= PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] : 
+			       instr[4] ? Uimm[ADDR_WIDTH-1:0] : 
+			                  Bimm[ADDR_WIDTH-1:0] );
+
+	   // testing instr[5] is equivalent to testing isStore in this context.
+	   loadstore_addr <= rs1[ADDR_WIDTH-1:0] + 
+ 		     (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+	   
+	   predicate <= predicate_;
+	   state <= EXECUTE2;
+	end
+	
+        // Second execute step: update PC from the registered values
+        // (JALR target: rs1+Iimm with bit 0 cleared), then either wait
+        // for the ALU / the memory or fetch the next instruction.
+        state[EXECUTE2_bit]: begin
+           PC <= isJALR          ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+                 jumpToPCplusImm ? PCplusImm :
+                 PCplus4;
+	   state <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+        end
+
+        // Stay here until the shifter is done and the memory is neither
+        // reading nor writing. This is also the state entered on reset.
+        state[WAIT_ALU_OR_MEM_bit]: begin
+           if(!aluBusy & !mem_rbusy & !mem_wbusy) state <= FETCH_INSTR;
+        end
+
+        // FETCH_INSTR lasts one clock (mem_rstrb is asserted during it).
+        default: begin // FETCH_INSTR
+          state <= WAIT_INSTR;
+        end
+	
+      endcase
+   end
+
+   /***************************************************************************/
+   // Cycle counter
+   /***************************************************************************/
+
+`ifdef NRV_COUNTER_WIDTH
+   reg [`NRV_COUNTER_WIDTH-1:0]  cycles;
+`else
+   reg [31:0]  cycles;
+`endif
+   // Free-running cycle counter, incremented on every rising clock edge.
+   always @(posedge clk) cycles <= cycles + 1;
+
+`ifdef BENCH
+   // Simulation only (no effect when BENCH is not defined).
+   // None of these registers is cleared by reset. In a 4-state simulator
+   // they would start as X:
+   // - cycles would stay X forever (X + 1 = X),
+   // - aluShamt = X makes aluBusy X, so the state machine would never
+   //   leave WAIT_ALU_OR_MEM, the state entered on reset,
+   // - registerFile[0] is never written (writes to register 0 are
+   //   suppressed), so reads of register 0 would return X.
+   initial begin
+      cycles = 0;
+      aluShamt = 0;
+      registerFile[0] = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+// Notes:
+//
+// [1] About the "reverse case" statement, also used in Claire Wolf's picorv32:
+// It is just a cleaner way of writing a series of cascaded if() statements,
+// To understand it, think about the case statement *in general* as follows:
+// case (expr)
+//       val_1: statement_1
+//       val_2: statement_2
+//   ... val_n: statement_n
+// endcase
+// The first statement_i such that expr == val_i is executed. 
+// Now if expr is 1'b1:
+// case (1'b1)
+//       cond_1: statement_1
+//       cond_2: statement_2
+//   ... cond_n: statement_n
+// endcase
+// It is *exactly the same thing*, the first statement_i such that
+// expr == cond_i is executed (that is, such that 1'b1 == cond_i,
+// in other words, such that cond_i is true)
+// More on this: 
+//     https://stackoverflow.com/questions/15418636/case-statement-in-verilog
+//
+// [2] state uses 1-hot encoding (at any time, state has only one bit set to 1).
+// It uses a larger number of bits (one bit per state), but often results in
+// a state machine that is both more compact (fewer LUTs) and faster.
+
@@ -0,0 +1,782 @@
+/******************************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+//
+// This version: PetitBateau (make it float), RV32IMFC
+// Rounding works as follows:
+// - all subnormals are flushed to zero
+// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
+// - FDIV and FSQRT do not have correct rounding
+//
+// [TODO] add FPU CSR (and instret for perf stat)
+// [TODO] FSW/FLW unaligned (does not seem to occur, but the standard requires it)
+// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
+// [TODO] support IEEE754 denormals
+// [TODO] NaNs propagation and infinity
+// [TODO] support all IEEE754 rounding modes
+//
+// Bruno Levy, Matthias Koch, 2020-2021
+/******************************************************************************/
+
+`include "petitbateau.v"
+
+// Firmware generation flags for this processor
+//    Note: atomic instructions not supported, but 'a' is set in
+//    compiler flag, because there is no toolchain/libs for
+//    rv32imfc / imf in most risc-V compiler distributions.
+
+`define NRV_ARCH     "rv32imafc" 
+`define NRV_ABI      "ilp32f"
+
+`define NRV_OPTIMIZE "-O0"
+`define NRV_INTERRUPTS
+
+// Check condition and display message in simulation
+`ifdef BENCH
+ `define ASSERT(cond,msg) if(!(cond)) $display msg
+ `define ASSERT_NOT_REACHED(msg) $display msg
+`else
+ `define ASSERT(cond,msg)
+ `define ASSERT_NOT_REACHED(msg)
+`endif
+
+module FemtoRV32(
+   input          clk,
+
+   output [31:0] mem_addr,  // address bus
+   output [31:0] mem_wdata, // data to be written
+   output  [3:0] mem_wmask, // write mask for the 4 bytes of each word
+   input  [31:0] mem_rdata, // input lines for both data and instr
+   output        mem_rstrb, // active to initiate memory read (used by IO)
+   input         mem_rbusy, // asserted if memory is busy reading value
+   input         mem_wbusy, // asserted if memory is busy writing value
+
+   input         interrupt_request,
+
+   input         reset      // set to 0 to reset the processor
+);
+
+   // Bit reversal of a 32-bit word: bit i of the result is bit 31-i of x.
+   // Used by the shifter, so that a single shifter does both left and
+   // right shifts (saves silicon).
+   function [31:0] flip32;
+      input [31:0] x;
+      integer i;
+      begin
+         for (i = 0; i < 32; i = i + 1) begin
+            flip32[i] = x[31-i];
+         end
+      end
+   endfunction
+
+   parameter RESET_ADDR       = 32'h00000000;
+   parameter ADDR_WIDTH       = 24;
+
+   localparam ADDR_PAD = {(32-ADDR_WIDTH){1'b0}}; // 32-bits padding for addrs
+
+   /***************************************************************************/
+   // Instruction decoding.
+   /***************************************************************************/
+
+   // Reference: Table page 104 of:
+   // https://content.riscv.org/wp-content/uploads/2017/05/riscv-spec-v2.2.pdf
+
+   wire [2:0] funct3 = instr[14:12];
+   
+   // The ALU function, decoded in 1-hot form (doing so reduces LUT count)
+   // It is used as follows: funct3Is[val] <=> funct3 == val
+   (* onehot *) wire [7:0] funct3Is = 8'b00000001 << instr[14:12];
+
+   // The five imm formats, see RiscV reference (link above), Fig. 2.4 p. 12
+   wire [31:0] Uimm={    instr[31],   instr[30:12], {12{1'b0}}};
+   wire [31:0] Iimm={{21{instr[31]}}, instr[30:20]};
+   /* verilator lint_off UNUSED */ // MSBs of SBJimms not used by addr adder.
+   wire [31:0] Simm={{21{instr[31]}}, instr[30:25],instr[11:7]};
+   wire [31:0] Bimm={{20{instr[31]}}, instr[7],instr[30:25],instr[11:8],1'b0};
+   wire [31:0] Jimm={{12{instr[31]}}, instr[19:12],instr[20],instr[30:21],1'b0};
+   /* verilator lint_on UNUSED */
+
+   // Base RISC-V (RV32I) has only 10 different instructions !
+   wire isLoad    =  (instr[6:3] == 4'b0000 ); // rd <-mem[rs1+Iimm] (bit 2:FLW)
+   wire isALUimm  =  (instr[6:2] == 5'b00100); // rd <- rs1 OP Iimm   
+   wire isAUIPC   =  (instr[6:2] == 5'b00101); // rd <- PC + Uimm
+   wire isStore   =  (instr[6:3] == 4'b0100 ); // mem[rs1+Simm]<-rs2 (bit 2:FSW)
+   wire isALUreg  =  (instr[6:2] == 5'b01100); // rd <- rs1 OP rs2
+   wire isLUI     =  (instr[6:2] == 5'b01101); // rd <- Uimm
+   wire isBranch  =  (instr[6:2] == 5'b11000); // if(rs1 OP rs2) PC<-PC+Bimm
+   wire isJALR    =  (instr[6:2] == 5'b11001); // rd <- PC+4; PC<-rs1+Iimm
+   wire isJAL     =  (instr[6:2] == 5'b11011); // rd <- PC+4; PC<-PC+Jimm
+   wire isSYSTEM  =  (instr[6:2] == 5'b11100); // rd <- CSR <- rs1/uimm5
+   wire isFPU     =  (instr[6:5] == 2'b10);    // all FPU instr except FLW/FSW
+   
+   wire isALU = isALUimm | isALUreg;
+
+   /***************************************************************************/
+   // The register file.
+   /***************************************************************************/
+
+   reg [31:0] rs1;
+   reg [31:0] rs2;
+   reg [31:0] rs3; // this one is used by the FMA instructions.
+   
+   reg [31:0] registerFile [63:0]; //  0..31: integer registers
+                                   // 32..63: floating-point registers
+   
+   /***************************************************************************/
+   // The ALU. Does operations and tests combinatorially, except divisions.
+   /***************************************************************************/
+
+   // First ALU source, always rs1
+   wire [31:0] aluIn1 = rs1;
+
+   // Second ALU source, depends on opcode:
+   //    ALUreg, Branch:     rs2
+   //    ALUimm, Load, JALR: Iimm
+   wire [31:0] aluIn2 = isALUreg | isBranch ? rs2 : Iimm;
+
+   wire aluWr; // ALU write strobe, starts dividing.
+
+   // The adder is used by both arithmetic instructions and JALR.
+   wire [31:0] aluPlus = aluIn1 + aluIn2;
+
+   // Use a single 33 bits subtract to do subtraction and all comparisons
+   // (trick borrowed from swapforth/J1)
+   wire [32:0] aluMinus = {1'b1, ~aluIn2} + {1'b0,aluIn1} + 33'b1;
+   wire        LT  = (aluIn1[31] ^ aluIn2[31]) ? aluIn1[31] : aluMinus[32];
+   wire        LTU = aluMinus[32];
+   wire        EQ  = (aluMinus[31:0] == 0);
+
+   /***************************************************************************/
+
+   // Use the same shifter both for left and right shifts by 
+   // applying bit reversal
+   // A left shift (funct3 == 1) is done as: reverse the bits of the
+   // operand, shift right, reverse the bits of the result.
+
+   // Shifter input: bit-reversed for left shifts, unchanged otherwise.
+   wire [31:0] shifter_in = funct3Is[1] ? flip32(aluIn1) : aluIn1;
+   
+   // A 33-bit signed value is shifted right arithmetically. Its extra top
+   // bit is aluIn1[31] when instr[30] is set and 0 otherwise, so the
+   // vacated positions are filled with the operand's sign bit only when
+   // instr[30] is set, and with zeros otherwise. The result is truncated
+   // to 32 bits (hence the WIDTH lint pragmas).
+   /* verilator lint_off WIDTH */
+   wire [31:0] shifter = 
+               $signed({instr[30] & aluIn1[31], shifter_in}) >>> aluIn2[4:0];
+   /* verilator lint_on WIDTH */
+
+   // Result for left shifts: the shifter output, bit-reversed back.
+   wire [31:0] leftshift = flip32(shifter);
+   
+   /***************************************************************************/
+
+   wire funcM     = instr[25];
+   wire isDivide  = isALUreg & funcM & instr[14];
+   wire aluBusy   = |div_cnt; // ALU is busy if division is in progress.
+
+   // funct3: 0->MUL  1->MULH  2->MULHSU  3->MULHU
+   wire isMULH   = funct3Is[1];
+   wire isMULHSU = funct3Is[2];
+
+   // Operand signedness, as defined by the RISC-V "M" extension:
+   //   MULH  : signed   rs1 x signed   rs2
+   //   MULHSU: signed   rs1 x unsigned rs2
+   //   MULHU : unsigned rs1 x unsigned rs2
+   // (for MUL, only the low 32 bits of the product are used, and they
+   //  do not depend on the signedness of the operands).
+   // aluIn1 is rs1 and aluIn2 is rs2 here, so rs1 is sign-extended for
+   // MULH and MULHSU, and rs2 for MULH only.
+   wire sign1 = aluIn1[31] & (isMULH | isMULHSU);
+   wire sign2 = aluIn2[31] &  isMULH;
+
+   // Operands extended to 33 bits with the bit computed above, so that
+   // a single signed multiplier handles all the variants.
+   wire signed [32:0] signed1 = {sign1, aluIn1};
+   wire signed [32:0] signed2 = {sign2, aluIn2};
+
+   wire signed [63:0]  multiply = signed1 * signed2;
+   
+   /***************************************************************************/
+
+   // Notes:
+   // - instr[30] is 1 for SUB and 0 for ADD
+   // - for SUB, need to test also instr[5] to discriminate ADDI:
+   //    (1 for ADD/SUB, 0 for ADDI, and Iimm used by ADDI overlaps bit 30 !)
+   // - instr[30] is 1 for SRA (do sign extension) and 0 for SRL
+
+   wire [31:0] aluOut_base =
+     (funct3Is[0]  ? instr[30] & instr[5] ? aluMinus[31:0] : aluPlus : 32'b0) |
+     (funct3Is[1]  ? leftshift                                       : 32'b0) |
+     (funct3Is[2]  ? {31'b0, LT}                                     : 32'b0) |
+     (funct3Is[3]  ? {31'b0, LTU}                                    : 32'b0) |
+     (funct3Is[4]  ? aluIn1 ^ aluIn2                                 : 32'b0) |
+     (funct3Is[5]  ? shifter                                         : 32'b0) |
+     (funct3Is[6]  ? aluIn1 | aluIn2                                 : 32'b0) |
+     (funct3Is[7]  ? aluIn1 & aluIn2                                 : 32'b0) ;
+
+   reg [31:0]  aluOut_mul;
+   always @(posedge clk) begin
+      aluOut_mul <= funct3Is[0] ? multiply[31:0] : multiply[63:32];
+   end
+
+   reg [31:0]  aluOut_div;
+   always @(posedge clk) begin
+      (* parallel_case, full_case *)
+      case(1'b1)
+	 instr[13] &  div_sign: aluOut_div <= -dividend;
+	 instr[13] & !div_sign: aluOut_div <=  dividend;
+	!instr[13] &  div_sign: aluOut_div <= -quotient;
+	!instr[13] & !div_sign: aluOut_div <=  quotient;	
+      endcase
+   end
+
+   reg [31:0] aluOut;
+   always @(*) begin
+      (* parallel_case *)
+      case(1'b1)
+	isALUreg & funcM &  instr[14]: aluOut = aluOut_div;
+	isALUreg & funcM & !instr[14]: aluOut = aluOut_mul;
+	default: aluOut = aluOut_base;
+      endcase
+   end
+    
+   /***************************************************************************/
+   // Implementation of DIV/REM instructions, highly inspired by PicoRV32
+
+   // Sequential divider: one quotient bit is computed per clock.
+   reg [31:0] dividend;  // absolute value of rs1, then the running remainder
+   reg [62:0] divisor;   // absolute value of rs2, shifted right at each step
+   reg [31:0] quotient;  // quotient bits, shifted in from the right
+   reg [5:0]  div_cnt;   // remaining cycles (non-zero: ALU is busy)
+   reg div_sign;         // set if the result needs to be negated
+   
+   always @(posedge clk) begin
+      if (aluWr) begin
+	 // instr[12] set: operands are used as they are (no sign handling).
+	 // instr[12] clear: work on absolute values and remember the sign of
+	 // the result: the sign of aluIn1 if instr[13] is set, else negative
+	 // if the operand signs differ and the divisor is not zero.
+	 div_sign <= ~instr[12] & (instr[13] ? aluIn1[31] : 
+                                  (aluIn1[31] != aluIn2[31]) & |aluIn2);
+         dividend <=   ~instr[12] & aluIn1[31] ? -aluIn1 : aluIn1;
+	 // The divisor starts shifted left by 31 positions.
+         divisor  <= {(~instr[12] & aluIn2[31] ? -aluIn2 : aluIn2), 31'b0};
+         quotient <= 0;
+	 div_cnt <= isDivide ? 33 : 0; // one additional cycle for aluOut_div
+      end else begin
+	 if(aluBusy) div_cnt <= div_cnt - 1;
+      end
+      // One division step, done while div_cnt is 2 or more (32 steps for
+      // the initial count of 33): shift the divisor right by one, and if
+      // it fits in the running remainder, subtract it and shift a 1 into
+      // the quotient, else shift a 0 into the quotient.
+      if(|div_cnt[5:1]) begin
+         divisor <= divisor >> 1;
+	 if(divisor <= {31'b0, dividend}) begin
+	    quotient <= {quotient[30:0],1'b1};
+	    dividend <= dividend - divisor[31:0];
+	 end else begin
+	    quotient <= {quotient[30:0],1'b0};	    
+	 end
+      end
+   end 
+
+   /***************************************************************************/
+   // The predicate for conditional branches.
+
+   wire predicate = funct3Is[0] &  EQ  | // BEQ
+                    funct3Is[1] & !EQ  | // BNE
+                    funct3Is[4] &  LT  | // BLT
+                    funct3Is[5] & !LT  | // BGE
+                    funct3Is[6] &  LTU | // BLTU
+                    funct3Is[7] & !LTU ; // BGEU
+
+   /***************************************************************************/
+   // Registers read-write 
+   /***************************************************************************/
+
+   // Register file access. A single bank of 64 registers is used: index
+   // bit 5 is 0 for an integer register and 1 for a floating-point one.
+   // At most one of the three branches below is taken per clock, with
+   // register reads having priority over the write-back.
+   always @(posedge clk) begin
+      if(state[WAIT_INSTR_bit]) begin
+	 // Fetch registers as soon as instruction is ready.
+	 // (indices are taken from raw_instr, declared outside this block)
+	 rs1 <= registerFile[{raw_rs1IsFP,raw_instr[19:15]}]; 
+	 rs2 <= registerFile[{raw_rs2IsFP,raw_instr[24:20]}];
+	 // rs3 is always read from the floating-point half of the bank.
+	 rs3 <= registerFile[{1'b1,       raw_instr[31:27]}];
+      end else if(state[DECOMPRESS_GETREGS_bit]) begin
+	 // For compressed instructions, fetch registers once decompressed.
+	 rs1 <= registerFile[{decomp_rs1IsFP,instr[19:15]}];
+	 rs2 <= registerFile[{decomp_rs2IsFP,instr[24:20]}];
+	 // no need to fetch rs3 here, there is no compressed FMA.
+      end else if(writeBack & !fpuBusy) begin
+	 // Write-back, held off while the FPU is busy. Integer register 0
+	 // is never written; floating-point register 0 can be written.
+	 if(rdIsFP || |instr[11:7]) begin
+            registerFile[{rdIsFP,instr[11:7]}] <= writeBackData;
+	 end
+      end
+   end
+
+   /***************************************************************************/
+   // The FPU 
+   /***************************************************************************/
+
+   wire fpuBusy;
+   wire [31:0] fpuOut;
+   PetitBateau FPU(
+      .clk(clk),
+      .wr(state[EXECUTE_bit] & isFPU),
+      .instr(instr[31:2]),
+      .rs1(rs1),
+      .rs2(rs2),
+      .rs3(rs3),
+      .busy(fpuBusy),		   
+      .out(fpuOut)		   
+   );
+   
+   // There is a single register bank, registers 0..31 are the integer
+   // registers, and 32..63 are the floating point registers, hence
+   // bit 5 of rs1,rs2,rd index is set to 0 for an integer register
+   // and 1 for a fp register. 
+
+   // asserted if the destination register is a floating-point register
+   wire rdIsFP = (instr[6:2] == 5'b00001)             || // FLW
+	         (instr[6:4] == 3'b100  )             || // F{N}MADD,F{N}MSUB
+	         (instr[6:4] == 3'b101 && (
+                            (instr[31]    == 1'b0)    || // R-Type FPU
+			    (instr[31:28] == 4'b1101) || // FCVT.S.W{U}
+			    (instr[31:28] == 4'b1111)    // FMV.W.X 
+			 )
+                 );
+
+   // rs1 is a FP register if instr[6:5] = 2'b10 except for:
+   //   FCVT.S.W{U}:  instr[6:2] = 5'b10100 and instr[30:28] = 3'b101
+   //   FMV.W.X    :  instr[6:2] = 5'b10100 and instr[30:28] = 3'b111
+   // (two versions of the signal, one for regular instruction decode,
+   //  the other one for compressed instructions).
+   wire raw_rs1IsFP = (raw_instr[6:5]   == 2'b10 ) &&  
+                     !((raw_instr[4:2]  == 3'b100) && (
+                      (raw_instr[31:28] == 4'b1101) || // FCVT.S.W{U}
+     	              (raw_instr[31:28] == 4'b1111)    // FMV.W.X
+                    )						    
+		  );
+
+   wire decomp_rs1IsFP = (instr[6:5]   == 2'b10 ) &&  
+                     !((instr[4:2]  == 3'b100) && (
+                      (instr[31:28] == 4'b1101) || // FCVT.S.W{U}
+     	              (instr[31:28] == 4'b1111)    // FMV.W.X
+                    )						    
+		  );
+   
+   // rs2 is a FP register if instr[6:5] = 2'b10 or instr is FSW
+   // (two versions of the signal, one for regular instruction decode,
+   //  the other one for compressed instructions).
+   wire raw_rs2IsFP = (raw_instr[6:5] == 2'b10) || (raw_instr[6:2]==5'b01001);
+   wire decomp_rs2IsFP =  (instr[6:5] == 2'b10) || (instr[6:2]==5'b01001);   
+
+   /***************************************************************************/
+   // Program counter and branch target computation.
+   /***************************************************************************/
+
+   reg  [ADDR_WIDTH-1:0] PC; // The program counter.
+   reg  [31:2] instr;        // Latched instruction. Note that bits 0 and 1 are
+                             // ignored (not used in RV32I base instr set).
+
+   wire [ADDR_WIDTH-1:0] PCplus2 = PC + 2;
+   wire [ADDR_WIDTH-1:0] PCplus4 = PC + 4;
+   wire [ADDR_WIDTH-1:0] PCinc   = long_instr ? PCplus4 : PCplus2;
+
+   // An adder used to compute branch address, JAL address and AUIPC.
+   // branch->PC+Bimm    AUIPC->PC+Uimm    JAL->PC+Jimm
+   // Equivalent to PCplusImm = PC + (isJAL ? Jimm : isAUIPC ? Uimm : Bimm)
+   wire [ADDR_WIDTH-1:0] PCplusImm = PC + ( instr[3] ? Jimm[ADDR_WIDTH-1:0] :
+                                            instr[4] ? Uimm[ADDR_WIDTH-1:0] :
+                                                       Bimm[ADDR_WIDTH-1:0] );
+
+   // A separate adder to compute the destination of load/store.
+   // testing instr[5] is equivalent to testing isStore in this context.
+   wire [ADDR_WIDTH-1:0] loadstore_addr = rs1[ADDR_WIDTH-1:0] +
+                   (instr[5] ? Simm[ADDR_WIDTH-1:0] : Iimm[ADDR_WIDTH-1:0]);
+
+   assign mem_addr = {ADDR_PAD,
+                       state[WAIT_INSTR_bit] | state[FETCH_INSTR_bit] ?
+                       fetch_second_half ? {PCplus4[ADDR_WIDTH-1:2], 2'b00}
+                                         : {PC     [ADDR_WIDTH-1:2], 2'b00}
+                       : loadstore_addr
+                     };
+
+   /***************************************************************************/
+   // Interrupt logic, CSR registers and opcodes.
+   /***************************************************************************/
+
+   // Remember interrupt requests as they are not checked for every cycle
+   reg  interrupt_request_sticky;
+   
+   // Interrupt enable and lock logic
+   wire interrupt = interrupt_request_sticky & mstatus & ~mcause;
+
+   // Processor accepts interrupts in EXECUTE state.   
+   wire interrupt_accepted = interrupt & state[EXECUTE_bit];        
+
+   // If current interrupt is accepted, there already might be the next one,
+   //  which should not be missed:
+   always @(posedge clk) begin
+     interrupt_request_sticky <= 
+         interrupt_request | (interrupt_request_sticky & ~interrupt_accepted);
+   end
+
+   // Decoder for mret opcode
+   wire interrupt_return = isSYSTEM & funct3Is[0]; // & (instr[31:20]==12'h302);
+
+   // CSRs:
+   reg  [ADDR_WIDTH-1:0] mepc;    // The saved program counter.
+   reg  [ADDR_WIDTH-1:0] mtvec;   // The address of the interrupt handler.
+   reg                   mstatus; // Interrupt enable
+   reg                   mcause;  // Interrupt cause (and lock)
+   reg  [63:0]           cycles;  // Cycle counter
+
+   always @(posedge clk) cycles <= cycles + 1;
+
+   wire sel_mstatus = (instr[31:20] == 12'h300);
+   wire sel_mtvec   = (instr[31:20] == 12'h305);
+   wire sel_mepc    = (instr[31:20] == 12'h341);
+   wire sel_mcause  = (instr[31:20] == 12'h342);
+   wire sel_cycles  = (instr[31:20] == 12'hC00);
+   wire sel_cyclesh = (instr[31:20] == 12'hC80);
+
+   // Read CSRs
+   wire [31:0] CSR_read =
+     (sel_mstatus ?    {28'b0, mstatus, 3'b0}  : 32'b0) |
+     (sel_mtvec   ? {ADDR_PAD, mtvec}          : 32'b0) |
+     (sel_mepc    ? {ADDR_PAD, mepc }          : 32'b0) |
+     (sel_mcause  ?            {mcause, 31'b0} : 32'b0) |
+     (sel_cycles  ?            cycles[31:0]    : 32'b0) |
+     (sel_cyclesh ?            cycles[63:32]   : 32'b0) ;
+
+
+   // Write CSRs: 5 bit unsigned immediate or content of RS1
+   wire [31:0] CSR_modifier = instr[14] ? {27'd0, instr[19:15]} : rs1; 
+
+   wire [31:0] CSR_write = (instr[13:12] == 2'b10) ? CSR_modifier | CSR_read  :
+                           (instr[13:12] == 2'b11) ? ~CSR_modifier & CSR_read :
+                        /* (instr[13:12] == 2'b01) ? */  CSR_modifier ;
+
+   always @(posedge clk) begin
+      if(!reset) begin
+	 mstatus <= 0;
+      end else begin
+	 // Execute a CSR opcode
+	 if (isSYSTEM & (instr[14:12] != 0) & state[EXECUTE_bit]) begin
+	    if (sel_mstatus) mstatus <= CSR_write[3];
+	    if (sel_mtvec  ) mtvec   <= CSR_write[ADDR_WIDTH-1:0];
+	 end
+      end
+   end
+
+   /***************************************************************************/
+   // The value written back to the register file.
+   /***************************************************************************/
+
+   wire [31:0] writeBackData  =
+      (isSYSTEM            ? CSR_read             : 32'b0) |  // SYSTEM
+      (isLUI               ? Uimm                 : 32'b0) |  // LUI
+      (isALU               ? aluOut               : 32'b0) |  // ALUreg, ALUimm
+      (isFPU               ? fpuOut               : 32'b0) |  // FPU
+      (isAUIPC             ? {ADDR_PAD,PCplusImm} : 32'b0) |  // AUIPC
+      (isJALR   | isJAL    ? {ADDR_PAD,PCinc    } : 32'b0) |  // JAL, JALR
+      (isLoad              ? LOAD_data            : 32'b0);   // Load
+
+   /***************************************************************************/
+   // LOAD/STORE
+   /***************************************************************************/
+
+   // All memory accesses are aligned on 32 bits boundary. For this
+   // reason, we need some circuitry that does unaligned halfword
+   // and byte load/store, based on:
+   // - funct3[1:0]:  00->byte 01->halfword 10->word
+   // - mem_addr[1:0]: indicates which byte/halfword is accessed
+
+   // TODO: support unaligned accesses for FLW and FSW 
+   
+   // instr[2] is set for FLW and FSW. instr[13:12] = func3[1:0]
+   wire mem_byteAccess     = !instr[2] && (instr[13:12] == 2'b00); 
+   wire mem_halfwordAccess = !instr[2] && (instr[13:12] == 2'b01); 
+
+   // LOAD, in addition to funct3[1:0], LOAD depends on:
+   // - funct3[2] (instr[14]): 0->do sign expansion   1->no sign expansion
+
+   wire LOAD_sign =
+        !instr[14] & (mem_byteAccess ? LOAD_byte[7] : LOAD_halfword[15]);
+
+   wire [31:0] LOAD_data =
+         mem_byteAccess ? {{24{LOAD_sign}},     LOAD_byte} :
+     mem_halfwordAccess ? {{16{LOAD_sign}}, LOAD_halfword} :
+                          mem_rdata ;
+
+   wire [15:0] LOAD_halfword =
+               loadstore_addr[1] ? mem_rdata[31:16] : mem_rdata[15:0];
+
+   wire  [7:0] LOAD_byte =
+               loadstore_addr[0] ? LOAD_halfword[15:8] : LOAD_halfword[7:0];
+
+   // STORE
+   assign mem_wdata[ 7: 0] = rs2[7:0];
+   assign mem_wdata[15: 8] = loadstore_addr[0] ? rs2[7:0]  : rs2[15: 8];
+   assign mem_wdata[23:16] = loadstore_addr[1] ? rs2[7:0]  : rs2[23:16];
+   assign mem_wdata[31:24] = loadstore_addr[0] ? rs2[7:0]  :
+                             loadstore_addr[1] ? rs2[15:8] : rs2[31:24];
+
+   // The memory write mask:
+   //    1111                     if writing a word
+   //    0011 or 1100             if writing a halfword
+   //                                (depending on loadstore_addr[1])
+   //    0001, 0010, 0100 or 1000 if writing a byte
+   //                                (depending on loadstore_addr[1:0])
+
+   wire [3:0] STORE_wmask =
+              mem_byteAccess      ?
+                    (loadstore_addr[1] ?
+                          (loadstore_addr[0] ? 4'b1000 : 4'b0100) :
+                          (loadstore_addr[0] ? 4'b0010 : 4'b0001)
+                    ) :
+              mem_halfwordAccess ?
+                    (loadstore_addr[1] ? 4'b1100 : 4'b0011) :
+              4'b1111;
+
+   /***************************************************************************/
+   // Unaligned fetch mechanism and compressed opcode handling
+   /***************************************************************************/
+
+   reg [ADDR_WIDTH-1:2] cached_addr;
+   reg           [31:0] cached_data;
+
+   wire current_cache_hit = cached_addr == PC     [ADDR_WIDTH-1:2];
+   wire    next_cache_hit = cached_addr == PC_new [ADDR_WIDTH-1:2];
+
+   wire current_unaligned_long = &cached_mem [17:16] & PC    [1];
+   wire    next_unaligned_long = &cached_data[17:16] & PC_new[1];
+
+   reg fetch_second_half;
+   reg long_instr;
+
+   wire [31:0] cached_mem   = current_cache_hit ? cached_data : mem_rdata;
+   wire [31:0] raw_instr = PC[1] ? {mem_rdata[15:0], cached_mem[31:16]} 
+                                    : cached_mem;
+   wire [31:0] decompressed;
+   decompressor _decomp ( .c(raw_instr[15:0]), .d(decompressed) );
+   
+   /*************************************************************************/
+   // And, last but not least, the state machine.
+   /*************************************************************************/
+
+   localparam FETCH_INSTR_bit          = 0;
+   localparam WAIT_INSTR_bit           = 1;
+   localparam DECOMPRESS_GETREGS_bit   = 2;   
+   localparam EXECUTE_bit              = 3;
+   localparam WAIT_ALU_OR_MEM_bit      = 4;
+   localparam WAIT_ALU_OR_MEM_SKIP_bit = 5;
+
+   localparam NB_STATES                = 6;
+
+   localparam FETCH_INSTR          = 1 << FETCH_INSTR_bit;
+   localparam WAIT_INSTR           = 1 << WAIT_INSTR_bit;
+   localparam DECOMPRESS_GETREGS   = 1 << DECOMPRESS_GETREGS_bit;   
+   localparam EXECUTE              = 1 << EXECUTE_bit;
+   localparam WAIT_ALU_OR_MEM      = 1 << WAIT_ALU_OR_MEM_bit;
+   localparam WAIT_ALU_OR_MEM_SKIP = 1 << WAIT_ALU_OR_MEM_SKIP_bit;
+
+   (* onehot *)
+   reg [NB_STATES-1:0] state;
+
+   // The signals (internal and external) that are determined
+   // combinatorially from state and other signals.
+
+   // register write-back enable.
+   wire writeBack = ~(isBranch | isStore ) & !fpuBusy & (
+            state[EXECUTE_bit] | 
+	    state[WAIT_ALU_OR_MEM_bit] | 
+            state[WAIT_ALU_OR_MEM_SKIP_bit]
+   );
+
+   // The memory-read signal.
+   assign mem_rstrb = state[EXECUTE_bit] & isLoad | state[FETCH_INSTR_bit];
+
+   // The mask for memory-write.
+   assign mem_wmask = {4{state[EXECUTE_bit] & isStore}} & STORE_wmask;
+
+   // aluWr starts computation (divide) in the ALU.
+   assign aluWr = state[EXECUTE_bit] & isALU;
+
+   wire jumpToPCplusImm = isJAL | (isBranch & predicate);
+
+   wire needToWait = isLoad | 
+                    (isStore & `NRV_IS_IO_ADDR(mem_addr)) | 
+                     isALUreg & funcM  /* isDivide */ | 
+                     isFPU;  
+
+   wire [ADDR_WIDTH-1:0] PC_new = 
+           isJALR           ? {aluPlus[ADDR_WIDTH-1:1],1'b0} :
+           jumpToPCplusImm  ? PCplusImm :
+           interrupt_return ? mepc :
+                              PCinc;
+
+   always @(posedge clk) begin
+      if(!reset) begin
+         state             <= WAIT_ALU_OR_MEM;     //Just waiting for !mem_wbusy
+         PC                <= RESET_ADDR[ADDR_WIDTH-1:0];
+         mcause            <= 0;
+         cached_addr       <= {ADDR_WIDTH-2{1'b1}};//Needs to be an invalid addr
+         fetch_second_half <= 0;
+      end else begin
+
+	 // See note [1] at the end of this file.
+	 (* parallel_case *)
+	 case(1'b1)
+
+           state[WAIT_INSTR_bit]: begin
+              if(!mem_rbusy) begin // may be high when executing from SPI flash
+		 // Update cache
+		 if (~current_cache_hit | fetch_second_half) begin
+                    cached_addr <= mem_addr[ADDR_WIDTH-1:2];
+                    cached_data <= mem_rdata;
+		 end;
+
+		 // Decode instruction
+		 // Registers are fetched at the same time, in the
+		 // FPU's always block.
+		 instr  <= &raw_instr[1:0] ? raw_instr[31:2] 
+                                           : decompressed[31:2];
+		 long_instr <= &raw_instr[1:0];
+
+		 // Long opcode, unaligned, first part fetched, 
+		 // happens in non-linear code
+		 if (current_unaligned_long & ~fetch_second_half) begin
+                    fetch_second_half <= 1;
+                    state <= FETCH_INSTR;
+		 end else begin
+                    fetch_second_half <= 0;
+                    state <= &raw_instr[1:0] ? EXECUTE : DECOMPRESS_GETREGS;
+		 end
+              end
+           end
+
+           state[DECOMPRESS_GETREGS_bit]: begin
+	      // All the registers are fetched in FPU's always block.
+	      state <= EXECUTE;
+	   end
+	   
+           state[EXECUTE_bit]: begin
+              if (interrupt) begin
+		 PC     <= mtvec;
+		 mepc   <= PC_new;
+		 mcause <= 1;
+		 state  <= needToWait ? WAIT_ALU_OR_MEM : FETCH_INSTR;
+              end else begin
+		 // Unaligned load/store not implemented yet
+		 // (the norm supposes that FLW and FSW can handle them)
+		 `ASSERT(
+                     !((isLoad|isStore) && instr[2] && |loadstore_addr[1:0]), 
+		     ("PC=%x UNALIGNED FLW/FSW",PC)
+                 );
+		 
+		 PC <= PC_new;
+		 if (interrupt_return) mcause <= 0;
+
+		 state <= next_cache_hit & ~next_unaligned_long
+  		        ? (needToWait ? WAIT_ALU_OR_MEM_SKIP : WAIT_INSTR)
+			: (needToWait ? WAIT_ALU_OR_MEM      : FETCH_INSTR);
+
+		 fetch_second_half <= next_cache_hit & next_unaligned_long;
+              end
+           end
+
+           state[WAIT_ALU_OR_MEM_bit]: begin
+              if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
+                 state <= FETCH_INSTR;
+	      end
+           end
+
+           state[WAIT_ALU_OR_MEM_SKIP_bit]: begin
+              if(!aluBusy & !fpuBusy & !mem_rbusy & !mem_wbusy) begin
+                 state <= WAIT_INSTR;
+	      end
+           end
+
+           default: begin // FETCH_INSTR
+              state <= WAIT_INSTR;
+           end
+	 endcase 
+      end
+   end
+
+`ifdef BENCH
+   initial begin
+      cycles = 0;
+      registerFile[0] = 0;
+   end
+`endif
+
+endmodule
+
+/*****************************************************************************/
+
+module decompressor(
+   input  wire [15:0] c,
+   output reg  [31:0] d
+);
+
+   // Combinatorial expander for 16-bit compressed (RVC) opcodes:
+   //   c : the compressed instruction,
+   //   d : the equivalent 32-bit instruction.
+   // Only encodings with c[1:0] != 2'b11 appear in the table; any encoding
+   // that matches no entry produces an all-X (don't care) word on d.
+   //
+   // Notes: * constants (illegal/unknown words, x0, x1, x2) are
+   //          'localparam's rather than wires
+   //        * decoding could be spread over several cycles should the
+   //          decompressor turn out to be a bottleneck
+
+   // Words emitted for illegal and unknown opcodes
+   localparam ILLEGAL_INSTR = 32'h0;
+   localparam UNKNOWN_INSTR = 32'h0; // only used by the disabled default below
+
+   // ------------------------------------------------------------------
+   // Register fields
+   // ------------------------------------------------------------------
+
+   wire [4:0] rp_lo = {2'b01, c[4:2]}; // 3-bit register field, low  position
+   wire [4:0] rp_hi = {2'b01, c[9:7]}; // 3-bit register field, high position
+
+   wire [4:0] r_lo  = c[ 6:2];         // 5-bit register field, low  position
+   wire [4:0] r_hi  = c[11:7];         // 5-bit register field, high position
+
+   localparam [4:0] ZERO = 5'b00000;   // x0
+   localparam [4:0] RA   = 5'b00001;   // x1
+   localparam [4:0] SP   = 5'b00010;   // x2
+
+   // ------------------------------------------------------------------
+   // 32-bit opcodes (instr[6:0]) generated by the table
+   // ------------------------------------------------------------------
+
+   localparam [6:0] OP_LOAD   = 7'b00000_11;
+   localparam [6:0] OP_FLW    = 7'b00001_11;
+   localparam [6:0] OP_IMM    = 7'b00100_11;
+   localparam [6:0] OP_STORE  = 7'b01000_11;
+   localparam [6:0] OP_FSW    = 7'b01001_11;
+   localparam [6:0] OP_REG    = 7'b01100_11;
+   localparam [6:0] OP_LUI    = 7'b01101_11;
+   localparam [6:0] OP_BRANCH = 7'b11000_11;
+   localparam [6:0] OP_JALR   = 7'b11001_11;
+   localparam [6:0] OP_JAL    = 7'b11011_11;
+
+   // ------------------------------------------------------------------
+   // Immediate fields, unscrambled from the compressed encoding
+   // ------------------------------------------------------------------
+
+   wire  [4:0] shamt        = c[6:2];
+
+   wire [11:0] imm_addi4spn = {2'b00, c[10:7], c[12:11], c[5], c[6], 2'b00};
+   wire [11:0] imm_lwsw     = {5'b00000, c[5], c[12:10], c[6], 2'b00};
+   wire [11:0] imm_lwsp     = {4'b0000, c[3:2], c[12], c[6:4], 2'b00};
+   wire [11:0] imm_swsp     = {4'b0000, c[8:7], c[12:9], 2'b00};
+
+   wire [11:0] imm_addi16sp = {{3{c[12]}}, c[4:3], c[5], c[2], c[6], 4'b0000};
+   wire [11:0] imm_ci       = {{7{c[12]}}, c[6:2]};
+
+   // Some bits of the three immediates below are never read.
+   /* verilator lint_off UNUSED */
+   wire [12:0] imm_cb  = {{ 5{c[12]}}, c[6:5], c[2], c[11:10], c[4:3], 1'b0};
+   wire [20:0] imm_cj  = {{10{c[12]}}, c[8], c[10:9], c[6], c[7], c[2],
+                          c[11], c[5:3], 1'b0};
+   wire [31:0] imm_lui = {{15{c[12]}}, c[6:2], 12'b000000000000};
+   /* verilator lint_on UNUSED */
+
+   // Immediates re-scrambled into the bit order used by 32-bit J-type
+   // (instr[31:12]) and B-type (instr[31:25], instr[11:7]) instructions.
+   wire [19:0] jal_field = {imm_cj[20], imm_cj[10:1], imm_cj[11], imm_cj[19:12]};
+   wire  [6:0] br_hi     = {imm_cb[12], imm_cb[10:5]};
+   wire  [4:0] br_lo     = {imm_cb[4:1], imm_cb[11]};
+
+   // ------------------------------------------------------------------
+   // Decoding table. casez gives priority to the first matching entry, so
+   // the order of overlapping entries (c.illegal/c.addi4spn,
+   // c.addi16sp/c.lui, c.jr/c.mv, c.jalr/c.add) matters.
+   // ------------------------------------------------------------------
+   always @*
+   casez (c[15:0])
+//    16'b???___????????_???_11 : d = c ;  // Long opcode, no need to decompress
+
+      // ---- Quadrant 0 ------------------------------------------------
+/* verilator lint_off CASEOVERLAP */
+      // c.illegal   -->  illegal
+      16'b000___00000000_000_00 : d = ILLEGAL_INSTR ;
+      // c.addi4spn  -->  addi rd', x2, nzuimm[9:2]
+      16'b000___????????_???_00 : d = {imm_addi4spn, SP, 3'b000, rp_lo, OP_IMM} ;
+/* verilator lint_on CASEOVERLAP */
+
+      // c.lw        -->  lw   rd', offset[6:2](rs1')
+      16'b010_???_???_??_???_00 : d = {imm_lwsw, rp_hi, 3'b010, rp_lo, OP_LOAD} ;
+      // c.sw        -->  sw   rs2', offset[6:2](rs1')
+      16'b110_???_???_??_???_00 : d = {imm_lwsw[11:5], rp_lo, rp_hi, 3'b010,
+                                       imm_lwsw[4:0], OP_STORE} ;
+
+      // ---- Quadrant 1 ------------------------------------------------
+      // c.addi      -->  addi rd, rd, nzimm[5:0]
+      16'b000_???_???_??_???_01 : d = {imm_ci, r_hi, 3'b000, r_hi, OP_IMM} ;
+      // c.jal       -->  jal  x1, offset[11:1]
+      16'b001____???????????_01 : d = {jal_field, RA, OP_JAL} ;
+      // c.li        -->  addi rd, x0, imm[5:0]
+      16'b010__?_?????_?????_01 : d = {imm_ci, ZERO, 3'b000, r_hi, OP_IMM} ;
+      // c.addi16sp  -->  addi x2, x2, nzimm[9:4]   (r_hi is x2 here)
+      16'b011__?_00010_?????_01 : d = {imm_addi16sp, r_hi, 3'b000, r_hi, OP_IMM} ;
+      // c.lui       -->  lui  rd, nzuimm[17:12]
+      16'b011__?_?????_?????_01 : d = {imm_lui[31:12], r_hi, OP_LUI} ;
+      // c.srli      -->  srli rd', rd', shamt[5:0]
+      16'b100_?_00_???_?????_01 : d = {7'b0000000, shamt, rp_hi, 3'b101, rp_hi, OP_IMM} ;
+      // c.srai      -->  srai rd', rd', shamt[5:0]
+      16'b100_?_01_???_?????_01 : d = {7'b0100000, shamt, rp_hi, 3'b101, rp_hi, OP_IMM} ;
+      // c.andi      -->  andi rd', rd', imm[5:0]
+      16'b100_?_10_???_?????_01 : d = {imm_ci, rp_hi, 3'b111, rp_hi, OP_IMM} ;
+      // c.sub       -->  sub  rd', rd', rs2'
+      16'b100_011_???_00_???_01 : d = {7'b0100000, rp_lo, rp_hi, 3'b000, rp_hi, OP_REG} ;
+      // c.xor       -->  xor  rd', rd', rs2'
+      16'b100_011_???_01_???_01 : d = {7'b0000000, rp_lo, rp_hi, 3'b100, rp_hi, OP_REG} ;
+      // c.or        -->  or   rd', rd', rs2'
+      16'b100_011_???_10_???_01 : d = {7'b0000000, rp_lo, rp_hi, 3'b110, rp_hi, OP_REG} ;
+      // c.and       -->  and  rd', rd', rs2'
+      16'b100_011_???_11_???_01 : d = {7'b0000000, rp_lo, rp_hi, 3'b111, rp_hi, OP_REG} ;
+      // c.j         -->  jal  x0, offset[11:1]
+      16'b101____???????????_01 : d = {jal_field, ZERO, OP_JAL} ;
+      // c.beqz      -->  beq  rs1', x0, offset[8:1]
+      16'b110__???_???_?????_01 : d = {br_hi, ZERO, rp_hi, 3'b000, br_lo, OP_BRANCH} ;
+      // c.bnez      -->  bne  rs1', x0, offset[8:1]
+      16'b111__???_???_?????_01 : d = {br_hi, ZERO, rp_hi, 3'b001, br_lo, OP_BRANCH} ;
+
+      // ---- Quadrant 2 ------------------------------------------------
+      // c.slli      -->  slli rd, rd, shamt[5:0]
+      16'b000__?_?????_?????_10 : d = {7'b0000000, shamt, r_hi, 3'b001, r_hi, OP_IMM} ;
+      // c.lwsp      -->  lw   rd, offset[7:2](x2)
+      16'b010__?_?????_?????_10 : d = {imm_lwsp, SP, 3'b010, r_hi, OP_LOAD} ;
+      // c.jr        -->  jalr x0, rs1, 0
+      16'b100__0_?????_00000_10 : d = {12'b000000000000, r_hi, 3'b000, ZERO, OP_JALR} ;
+      // c.mv        -->  add  rd, x0, rs2
+      16'b100__0_?????_?????_10 : d = {7'b0000000, r_lo, ZERO, 3'b000, r_hi, OP_REG} ;
+      // c.ebreak    -->  ebreak   (disabled: the c.ebreak encoding therefore
+      //                            matches the c.jalr entry below, with rs1 = x0)
+   // 16'b100__1_00000_00000_10 : d = {25'b00000000_00010000_00000000_0, 7'b11100_11} ;
+      // c.jalr      -->  jalr x1, rs1, 0
+      16'b100__1_?????_00000_10 : d = {12'b000000000000, r_hi, 3'b000, RA, OP_JALR} ;
+      // c.add       -->  add  rd, rd, rs2
+      16'b100__1_?????_?????_10 : d = {7'b0000000, r_lo, r_hi, 3'b000, r_hi, OP_REG} ;
+      // c.swsp      -->  sw   rs2, offset[7:2](x2)
+      16'b110__?_?????_?????_10 : d = {imm_swsp[11:5], r_lo, SP, 3'b010,
+                                       imm_swsp[4:0], OP_STORE} ;
+
+      // ---- Four compressed RV32F load/store instructions -----------------
+      // c.flw       -->  flw   rd', offset[6:2](rs1')
+      16'b011_???_???_??_???_00 : d = {imm_lwsw, rp_hi, 3'b010, rp_lo, OP_FLW} ;
+      // c.fsw       -->  fsw   rs2', offset[6:2](rs1')
+      16'b111_???_???_??_???_00 : d = {imm_lwsw[11:5], rp_lo, rp_hi, 3'b010,
+                                       imm_lwsw[4:0], OP_FSW} ;
+      // c.flwsp     -->  flw   rd, offset[7:2](x2)
+      16'b011__?_?????_?????_10 : d = {imm_lwsp, SP, 3'b010, r_hi, OP_FLW} ;
+      // c.fswsp     -->  fsw   rs2, offset[7:2](x2)
+      16'b111__?_?????_?????_10 : d = {imm_swsp[11:5], r_lo, SP, 3'b010,
+                                       imm_swsp[4:0], OP_FSW} ;
+
+      // Anything else: don't care.
+//    default: d = UNKNOWN_INSTR ; // Unknown opcode
+      default: d = {32{1'bx}};
+   endcase
+endmodule
+
+/*****************************************************************************/
@@ -0,0 +1,856 @@
+/******************************************************************************/
+// FemtoRV32, a collection of minimalistic RISC-V RV32 cores.
+//
+// PetitBateau (make it float): a simple single-precision RISC-V FPU
+//   Mission statement: achieve a good area/performance ratio, by
+//   implementing a full-precision FMA (48 bits), and micro-programmed
+//   Newton-Raphson for FDIV and FSQRT (that reuse the FMA).
+// 
+// Rounding works as follows:
+// - all subnormals are flushed to zero
+// - FADD, FSUB, FMUL, FMADD, FMSUB, FNMADD, FNMSUB: IEEE754 round to zero
+// - FDIV and FSQRT do not have correct rounding
+//   if PRECISE_DIV is set (default), then FDIV rounding is validated in 
+//      tinyraytracer test. Complete proof remains to be done
+//
+// [TODO] add FPU CSR (and instret for perf stat)
+// [TODO] correct IEEE754 round to zero for FDIV and FSQRT
+// [TODO] support IEEE754 denormals
+// [TODO] NaNs propagation and infinity
+// [TODO] support all IEEE754 rounding modes
+//
+// Bruno Levy, 2021
+/******************************************************************************/
+
+// TODO: instead of mux between A,B,C and FMA, make FMA always compute
+//       A*B+C and mux rs1,rs2,rs3,1.0,0.0 to A,B,C based on instr (mux
+//       will be more complicated but will probably reduce overall
+//       critical path) ?
+// TODO: there are too many different paths between the internal registers,
+//       maybe micro-instructions could be redesigned with this in mind.
+//       A could be the MSBs of X, avoiding all MV_A_X instructions.
+// TODO: the necessity to copy rs1 in E without flushing denormals for
+//       the int-to-fp instructions is inelegant.
+
+// Include guard for LiteX
+`ifndef PETITBATEAU_INCLUDED
+`define PETITBATEAU_INCLUDED
+
+// Check condition and display message in simulation
+`ifdef BENCH
+ `define ASSERT(cond,msg) if(!(cond)) $display msg
+ `define ASSERT_NOT_REACHED(msg) $display msg
+`else
+ `define ASSERT(cond,msg)
+ `define ASSERT_NOT_REACHED(msg)
+`endif
+
+module PetitBateau(
+   input 	     clk,
+   input 	     wr,    // write strobe, starts computation
+   input [31:2]      instr, // current riscv instruction		   
+
+   // operands		   
+   input [31:0]      rs1,
+   input [31:0]      rs2,
+   input [31:0]      rs3,
+
+   // outputs		   
+   output 	 busy,
+   output [31:0] out 	
+);
+
+   // Set to 1 for higher-precision FDIV (costs 30 additional cycles per FDIV)
+   parameter PRECISE_DIV = 1;
+
+   
+   // Uncomment the line below to emulate all FPU instructions in Verilator
+   // (useful to test instruction decoder and implementations of micro-instr
+   // in C++). See SIM/FPU_funcs.{h,cpp}
+//`define FPU_EMUL
+
+   // Two high-resolution registers for the FMA, that computes X+Y
+   // Register X has the accumulator / shifters / leading zero counter
+   // Normalized if first bit set is bit 47
+   // Represented number is +/- frac * 2^(exp-127-47)
+   
+   reg X_sign; reg signed [8:0] X_exp; reg signed [49:0] X_frac;
+   reg Y_sign; reg signed [8:0] Y_exp; reg signed [49:0] Y_frac;
+   
+   // FPU output = 32 MSBs of X register (see below)
+   // A macro to easily write to it (`X <= ...),
+   // used when FPU output is an integer.
+   `define X {X_sign, X_exp[7:0], X_frac[46:24]}
+   assign out = `X;
+   
+   // Five single-precision floating-point registers for internal use.
+   // A,B,C are wired to the FMA that computes either A*B+C or A+B
+   // D,E are temporaries used by FDIV and FSQRT
+   // Following IEEE754, represented number is +/- frac * 2^(exp-127-23)
+   // (127: bias  23: position of first bit set for normalized numbers)
+   reg A_sign; reg [7:0] A_exp; reg [23:0] A_frac;
+   reg B_sign; reg [7:0] B_exp; reg [23:0] B_frac;
+   reg C_sign; reg [7:0] C_exp; reg [23:0] C_frac;
+   reg D_sign; reg [7:0] D_exp; reg [23:0] D_frac;
+   reg E_sign; reg [7:0] E_exp; reg [23:0] E_frac;
+   
+   /*************************************************************************/
+
+   // Load a 32-bit value in RD
+   // RD:  one of A,B,C,D,E
+   // VAL: a 32-bit value
+   `define FP_LD32(RD,VAL)         \
+         {RD``_sign, RD``_exp, RD``_frac[22:0]} <= VAL; RD``_frac[23] <= 1'b1
+	 
+   // Load floating point value in RD by sign, exponent, fraction
+   // RD: one of A,B,C,D,E
+   // sign: 1'b1 (-) or 1'b0 (+)
+   // exp: 8-bits, biased exponent
+   // frac: 24-bit fraction
+   `define FP_LD(RD,sign,eexp,frac) \
+         {RD``_sign, RD``_exp, RD``_frac} <= {sign,eexp,frac}
+	 
+   // RD <= RS
+   // RD,RS: one of A,B,C,D,E
+   `define FP_MV(RD,RS)            \
+         {RD``_sign, RD``_exp, RD``_frac} <= {RS``_sign, RS``_exp, RS``_frac}
+
+   /** FPU micro-instructions and ROM ****************************************/
+
+   
+   localparam FPMI_READY           = 0; 
+   localparam FPMI_LOAD_XY         = 1;   // X <- A; Y <- B
+   localparam FPMI_LOAD_XY_MUL     = 2;   // X <- norm(A*B); Y <- C
+   localparam FPMI_ADD_SWAP        = 3;   // if |X|>|Y| swap(X,Y);
+                                          // if sign(X) != sign(Y) X <- -X
+   localparam FPMI_ADD_SHIFT       = 4;   // shift X to match Y exponent
+   localparam FPMI_ADD_ADD         = 5;   // X <- X + Y   
+   localparam FPMI_ADD_NORM        = 6;   // X <- norm(X) (after ADD_ADD)
+   
+   localparam FPMI_CMP             = 7;   // X <- test X,Y (FEQ,FLE,FLT)
+
+   localparam FPMI_MV_A_X          =  8;  // A <- X
+   localparam FPMI_MV_B_D          =  9;  // B <- D
+   localparam FPMI_MV_B_NH_D       = 10;  // B <- -0.5*|D|
+   localparam FPMI_MV_B_E          = 11;  // B <- E
+   localparam FPMI_MV_C_A          = 12;  // C <- A
+   localparam FPMI_MV_E_X          = 13;  // E <- X
+
+   localparam FPMI_FRCP_PROLOG     = 14;  // init reciprocal (1/x) 
+   localparam FPMI_FRCP_ITER1      = 15;  // iteration for reciprocal
+   localparam FPMI_FRCP_ITER2      = 16;  // iteration for reciprocal   
+   localparam FPMI_FRCP_EPILOG     = 17;  // epilog for reciprocal
+   localparam FPMI_FDIV_EPILOG     = 18;  // epilog for fdiv IEEE-754 rounding
+   
+   localparam FPMI_FRSQRT_PROLOG   = 19;  // init recipr sqr root (1/sqrt(x))
+   
+   localparam FPMI_FP_TO_INT       = 20;  // fpuOut <- fpoint_to_int(A)
+   localparam FPMI_INT_TO_FP       = 21;  // X <- int_to_fpoint(X)
+   localparam FPMI_MIN_MAX         = 22;  // fpuOut <- min/max(X,Y) 
+
+   localparam FPMI_LOAD_Y_ROUND    = 23;  // Y <- round to nearest
+   
+   localparam FPMI_NB              = 24;
+
+   // Instruction exit flag (if set in current micro-instr, exit microprogram)
+   localparam FPMI_EXIT_FLAG_bit   = 1+$clog2(FPMI_NB);
+   localparam FPMI_EXIT_FLAG       = 1 << FPMI_EXIT_FLAG_bit;
+   
+   reg [6:0] 	             fpmi_PC;    // current micro-instruction pointer
+   reg [1+$clog2(FPMI_NB):0] fpmi_instr; // current micro-instruction
+
+   // current micro-instruction as 1-hot: fpmi_instr == NNN <=> fpmi_is[NNN]
+   (* onehot *)
+   wire [FPMI_NB-1:0] fpmi_is = 1 << fpmi_instr[$clog2(FPMI_NB):0]; 
+   initial fpmi_PC = 0;
+   assign busy = !fpmi_is[FPMI_READY];
+
+   // Generate a micro-instruction in ROM
+   task fpmi_gen; input [6:0] instr; begin
+      fpmi_ROM[I] = instr;
+      I = I + 1;
+   end endtask   
+
+   // Generate a FMA sequence in ROM.
+   // Use fpmi_gen_fma(0) in the middle of a micro-program
+   // Use fpmi_gen_fma(FPMI_EXIT_FLAG) if last instruction of micro-program
+   task fpmi_gen_fma; input [6:0] flags; begin
+      fpmi_gen(FPMI_LOAD_XY_MUL);      // X <- norm(A*B), Y <- C  
+      fpmi_gen(FPMI_ADD_SWAP);         // if(|X| > |Y|) swap(X,Y) (and sgn)
+      fpmi_gen(FPMI_ADD_SHIFT);        // shift X according to Y exp
+      fpmi_gen(FPMI_ADD_ADD);          // X <- X + Y
+      fpmi_gen(FPMI_ADD_NORM | flags); // X <- normalize(X)
+   end endtask
+   
+   integer I;    // current ROM location in initialization
+   integer iter; // iteration variable for generate Newton-Raphson (FDIV,FSQRT)
+   localparam FPMI_ROM_SIZE=82 + (12 + 18)*PRECISE_DIV; 
+   reg [1+$clog2(FPMI_NB):0] fpmi_ROM[0:FPMI_ROM_SIZE-1];
+   
+   // Microprograms start addresses
+   // Programmatically determined when generating the ROM ('initial' block below)
+   integer FPMPROG_CMP, FPMPROG_ADD, FPMPROG_MUL, FPMPROG_MADD, FPMPROG_DIV;
+   integer FPMPROG_FP_TO_INT, FPMPROG_INT_TO_FP, FPMPROG_SQRT, FPMPROG_MIN_MAX;
+
+   // Start the definition of a microprogram (determines start address)
+   `define FPMPROG_BEGIN(prg) prg = I
+
+   // Ends the definition of a microprogram (displays stats in Verilator)
+   `ifdef BENCH
+    `define FPMPROG_END(prg) \
+        $display("#  %3d microinstructions used by %d:%s",I-prg,prg,`"prg`")
+   `else
+    `define FPMPROG_END(prg) 
+   `endif
+
+   /******************** Generate microprograms in ROM **********************/
+   initial begin
+
+   `ifdef BENCH
+      $display("#  Generating FPMI ROM...");
+   `endif
+      I = 0;
+      fpmi_gen(FPMI_READY | FPMI_EXIT_FLAG);
+
+      // ******************** FLT, FLE, FEQ *********************************
+      `FPMPROG_BEGIN(FPMPROG_CMP);
+      fpmi_gen(FPMI_LOAD_XY);              // X <- A, Y <- B
+      fpmi_gen(FPMI_CMP | FPMI_EXIT_FLAG); // X <- compare(X,Y)
+      `FPMPROG_END(FPMPROG_CMP);
+      
+      // ******************** FADD, FSUB ************************************
+      `FPMPROG_BEGIN(FPMPROG_ADD);
+      fpmi_gen(FPMI_LOAD_XY);               // X <- A, Y <- B
+      fpmi_gen(FPMI_ADD_SWAP);              // if(|X| > |Y|) swap(X,Y) (,sgn)
+      fpmi_gen(FPMI_ADD_SHIFT);             // shift X according to Y exp
+      fpmi_gen(FPMI_ADD_ADD);               // X <- X + Y
+      fpmi_gen(FPMI_ADD_NORM | FPMI_EXIT_FLAG); // X <- normalize(X)
+      `FPMPROG_END(FPMPROG_ADD);
+      
+      // ******************** FMUL ******************************************
+      `FPMPROG_BEGIN(FPMPROG_MUL);
+      fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B
+      `FPMPROG_END(FPMPROG_MUL);
+
+      // ******************** FMADD, FMSUB, FNMADD, FNMSUB ******************
+      `FPMPROG_BEGIN(FPMPROG_MADD);
+      fpmi_gen_fma(FPMI_EXIT_FLAG); // X <- A*B+C (5 cycles)
+      `FPMPROG_END(FPMPROG_MADD);      
+
+      // ******************** FDIV ******************************************
+      // https://en.wikipedia.org/wiki/Division_algorithm
+      // https://stackoverflow.com/questions/24792966/
+      // error-using-newton-raphson-iteration-method-for-
+      // floating-point-division
+      //
+      `FPMPROG_BEGIN(FPMPROG_DIV);      
+      // D' = denominator (rs2) normalized between [0.5,1] (set exp to 126)
+      fpmi_gen(FPMI_FRCP_PROLOG); // D<-A; E<-B; A<-(-D'); B<-32/17; C<-48/17
+      fpmi_gen_fma(0);            // X <- A*B+C (= -D'*32/17 + 48/17)
+      for(iter=0; iter<3; iter=iter+1) begin
+	 if(PRECISE_DIV) begin
+	    // X <- X + X*(1-D'*X)
+	    // (slower more precise iter, but not IEEE754 compliant yet...)
+	    fpmi_gen(FPMI_FRCP_ITER1); // A <- -D'; B <- X; C <- 1.0f
+	    fpmi_gen_fma(0);           // X <- A*B+C (5 cycles)
+	    fpmi_gen(FPMI_FRCP_ITER2); // A <- X; C <- B
+	    fpmi_gen_fma(0);           // X <- A*B+C (5 cycles)
+	 end else begin
+	    //  X <- X * (-X*D' + 2)
+	    // (faster but less precise)
+	    fpmi_gen(FPMI_FRCP_ITER1);  // A <- -D'; B <- X; C <- 2.0f    
+	    fpmi_gen_fma(0);            // X <- A*B+C (5 cycles)
+	    fpmi_gen(FPMI_MV_A_X);      // A <- X
+	    fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B; Y <- C
+	 end
+      end 
+      if(PRECISE_DIV) begin             // round X to nearest
+	 fpmi_gen(FPMI_LOAD_Y_ROUND);
+	 fpmi_gen(FPMI_ADD_ADD);
+	 fpmi_gen(FPMI_ADD_NORM);
+      end      
+      fpmi_gen(FPMI_FRCP_EPILOG); // A <- (E_sign,frcp_exp,X_frac); B <- D
+      if(PRECISE_DIV) begin // error correction
+	 fpmi_gen(FPMI_LOAD_XY_MUL); // X <- A*B
+	 fpmi_gen(FPMI_FDIV_EPILOG); // B <- -E; C <- D; D <- A
+	 fpmi_gen(FPMI_MV_A_X);
+	 fpmi_gen_fma(0);
+	 fpmi_gen(FPMI_MV_C_A);
+	 fpmi_gen(FPMI_MV_B_D);
+	 fpmi_gen(FPMI_MV_A_X);
+	 fpmi_gen_fma(FPMI_EXIT_FLAG);
+      end else begin
+	 fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B
+      end
+      `FPMPROG_END(FPMPROG_DIV);      
+      
+      // ******************** FCVT.W.S, FCVT.WU.S ***************************
+      `FPMPROG_BEGIN(FPMPROG_FP_TO_INT);
+      fpmi_gen(FPMI_LOAD_XY);
+      fpmi_gen(FPMI_FP_TO_INT | FPMI_EXIT_FLAG);
+      `FPMPROG_END(FPMPROG_FP_TO_INT);      
+      
+      // ******************** FCVT.S.W, FCVT.S.WU ***************************
+      `FPMPROG_BEGIN(FPMPROG_INT_TO_FP); // Compute A+0 (use CLZ plugged on X)
+      fpmi_gen(FPMI_INT_TO_FP);                 // X <- 0; Y <- A
+      fpmi_gen(FPMI_ADD_ADD);                   // X <- X + Y
+      fpmi_gen(FPMI_ADD_NORM | FPMI_EXIT_FLAG); // X <- normalize(X)
+      `FPMPROG_END(FPMPROG_INT_TO_FP);
+      
+      // ******************** FSQRT *****************************************
+      // Using Doom's fast inverse square root algorithm:
+      // https://en.wikipedia.org/wiki/Fast_inverse_square_root
+      // http://www.lomont.org/papers/2003/InvSqrt.pdf
+      // TODO: IEEE754-compliant version
+      // See https://t.co/V1SWQ6N6xD?amp=1 (Method of Switching Constants)
+      // See simple effective fast inverse square root with two magic 
+      // constants.
+      //
+      `FPMPROG_BEGIN(FPMPROG_SQRT);
+      // D<-rs1; E,A,B<-(doom_magic - (A >> 1)); C<-3/2
+      fpmi_gen(FPMI_FRSQRT_PROLOG);
+      for(iter=0; iter<2; iter=iter+1) begin
+	 // X <- X * (3/2 - (0.5*rs1*X*X))      	 
+	 fpmi_gen(FPMI_LOAD_XY_MUL);  // X <- A*B; Y <- C
+	 fpmi_gen(FPMI_MV_A_X);       // A <- X
+         fpmi_gen(FPMI_MV_B_NH_D);    // B <- -0.5*|D|
+	 fpmi_gen_fma(0);             // X <- A*B+C
+	 fpmi_gen(FPMI_MV_A_X);       // A <- X
+	 fpmi_gen(FPMI_MV_B_E);       // B <- E
+	 fpmi_gen(FPMI_LOAD_XY_MUL);  // X <- A*B; Y <- C
+	 if(iter==0) begin
+	    fpmi_gen(FPMI_MV_E_X);    // E <- X
+	    fpmi_gen(FPMI_MV_A_X);    // A <- X
+	    fpmi_gen(FPMI_MV_B_E);    // B <- E
+	 end
+      end // X contains 1/sqrt(rs1), now compute rs1*X to get sqrt(rs1)
+      fpmi_gen(FPMI_MV_A_X);                       // A <- X
+      fpmi_gen(FPMI_MV_B_D);                       // B <- D
+      fpmi_gen(FPMI_LOAD_XY_MUL | FPMI_EXIT_FLAG); // X <- A*B; Y <- C
+      `FPMPROG_END(FPMPROG_SQRT);
+      
+      // ******************** FMIN, FMAX ************************************
+      `FPMPROG_BEGIN(FPMPROG_MIN_MAX);
+      fpmi_gen(FPMI_LOAD_XY);
+      fpmi_gen(FPMI_MIN_MAX | FPMI_EXIT_FLAG);
+      `FPMPROG_END(FPMPROG_MIN_MAX);
+      
+`ifdef BENCH      
+      $display("#  FPMI ROM max address:%d",I-1);
+      $display("#  FPMI ROM size       :%d",FPMI_ROM_SIZE);      
+      `ASSERT(I <= FPMI_ROM_SIZE,("!!!!!!! FPMI ROM SIZE exceeded !!!!!!!"));
+`endif      
+   end
+
+`ifndef FPU_EMUL
+
+   // Entry address, in the micro-instruction ROM, of the microprogram that
+   // implements the decoded instruction. It is loaded into the micro-program
+   // counter when wr is asserted. Instructions that have no microprogram
+   // select entry 0.
+   reg [6:0] fpmprog;
+   always @(*) begin
+      (* parallel_case, full_case *)
+      case(1'b1)
+        isFLT, isFLE, isFEQ                  : fpmprog = FPMPROG_CMP[6:0];
+        isFADD, isFSUB                       : fpmprog = FPMPROG_ADD[6:0];
+        isFMUL                               : fpmprog = FPMPROG_MUL[6:0];
+        isFMADD, isFMSUB, isFNMADD, isFNMSUB : fpmprog = FPMPROG_MADD[6:0];
+        isFDIV                               : fpmprog = FPMPROG_DIV[6:0];
+        isFSQRT                              : fpmprog = FPMPROG_SQRT[6:0];
+        isFCVTWS, isFCVTWUS                  : fpmprog = FPMPROG_FP_TO_INT[6:0];
+        isFCVTSW, isFCVTSWU                  : fpmprog = FPMPROG_INT_TO_FP[6:0];
+        isFMIN, isFMAX                       : fpmprog = FPMPROG_MIN_MAX[6:0];
+        default                              : fpmprog = 0;
+      endcase
+   end
+   
+   // Micro-program sequencer.
+   // Next micro-instruction program counter:
+   //   - wr asserted                     : jump to the selected microprogram
+   //   - current micro-instr. has EXIT   : go back to address 0
+   //   - otherwise                       : next micro-instruction
+   wire [6:0] fpmi_PC_next = 
+      wr ? fpmprog : (fpmi_instr[FPMI_EXIT_FLAG_bit] ? 0 : fpmi_PC+1);
+
+   // The micro-instruction register is fetched from the ROM at the *next*
+   // address, so that it always matches the registered program counter.
+   always @(posedge clk) begin
+      fpmi_instr <= fpmi_ROM[fpmi_PC_next];
+      fpmi_PC    <= fpmi_PC_next;
+   end
+   
+
+   // FPU datapath.
+   //  - When wr is asserted: the operands rs1, rs2, rs3 are latched into the
+   //    A, B, C registers (rs1 is also copied into E) and the single-cycle
+   //    instructions directly write their result to X.
+   //  - Otherwise, while busy: the micro-instruction currently flagged in
+   //    fpmi_is is executed.
+   always @(posedge clk) begin
+      if(wr) begin
+         // Denormals are flushed to zero
+         `FP_LD(A, rs1[31], rs1[30:23], (|rs1[30:23]?{1'b1,rs1[22:0]}:24'b0));
+         `FP_LD(B, rs2[31], rs2[30:23], (|rs2[30:23]?{1'b1,rs2[22:0]}:24'b0));
+         `FP_LD(C, rs3[31], rs3[30:23], (|rs3[30:23]?{1'b1,rs3[22:0]}:24'b0));
+
+         // Backup rs1 in E without flushing to zero (for int2fp instructions)
+         `FP_LD32(E, rs1);
+
+         // Single-cycle instructions
+         (* parallel_case *)
+         case(1'b1)
+           isFSGNJ           : `X <= {         rs2[31], rs1[30:0]};
+           isFSGNJN          : `X <= {        !rs2[31], rs1[30:0]};
+           isFSGNJX          : `X <= { rs1[31]^rs2[31], rs1[30:0]};
+           isFCLASS          : `X <= fclass;
+           isFMVXW | isFMVWX : `X <= rs1;
+         endcase 
+      end else if(busy) begin 
+
+         // Implementation of the micro-instructions
+         (* parallel_case *)
+         case(1'b1)
+           // X <- A ; Y <- (+/-)B   (sign of Y is flipped for FSUB)
+           fpmi_is[FPMI_LOAD_XY]: begin
+              X_sign <= A_sign;
+              X_frac <= {2'b0, A_frac, 24'd0};
+              X_exp  <= {1'b0, A_exp}; 
+              Y_sign <= B_sign ^ isFSUB;
+              Y_frac <= {2'b0, B_frac, 24'd0};
+              Y_exp  <= {1'b0, B_exp}; 
+           end
+
+           // X <- (+/-) normalize(A*B);  Y <- (+/-)C
+           fpmi_is[FPMI_LOAD_XY_MUL]: begin
+              X_sign <= A_sign ^ B_sign ^ (isFNMSUB | isFNMADD);
+              X_frac <= prod_Z ? 0 :  
+                          (prod_frac[47] ? prod_frac : {prod_frac[48:0],1'b0}); 
+              X_exp  <= prod_Z ? 0 : prod_exp_norm;
+              Y_sign <= C_sign ^ (isFMSUB | isFNMADD);
+              Y_frac <= {2'b0, C_frac, 24'd0};
+              Y_exp  <= {1'b0, C_exp};
+           end
+
+           // if(|X| > |Y|) swap(X,Y)
+           // if X_sign != Y_sign X <- -X
+           // We always *add*, but replace X_frac with -X_frac if the
+           // sign of the operands differ, THEN we shift (signed shift). In
+           // this way, rounding is correct, even when subtracting a
+           // low magnitude number from a large magnitude one.
+           fpmi_is[FPMI_ADD_SWAP]: begin
+              if(fabsY_LT_fabsX) begin
+                 X_frac <= (X_sign ^ Y_sign) ? -Y_frac : Y_frac; 
+                 Y_frac <= X_frac;
+                 X_exp  <= Y_exp;  Y_exp  <= X_exp;
+                 X_sign <= Y_sign; Y_sign <= X_sign;
+              end else if(X_sign ^ Y_sign) begin
+                 X_frac <= -X_frac;
+              end
+           end
+
+           // shift X in order to make it match Y exponent
+           fpmi_is[FPMI_ADD_SHIFT]: begin
+              `ASSERT(!fabsY_LT_fabsX, ("ADD_SHIFT: incorrect order"));
+              X_frac <= X_frac >>> exp_diff; // note the signed shift !
+              X_exp <= Y_exp;
+           end
+
+           // X <- X (+/-) Y
+           fpmi_is[FPMI_ADD_ADD]: begin
+              X_frac      <= frac_sum[49:0];
+              X_sign      <= Y_sign;
+              // normalization left shamt = 47 - first_bit_set = clz - 16
+              norm_lshamt <= frac_sum_clz - 16;
+              // Exponent of X once normalized = X_exp + first_bit_set - 47
+              //                 = X_exp + 63 - clz - 47 = X_exp + 16 - clz
+              X_exp_norm <= X_exp + 16 - {3'b000,frac_sum_clz};
+           end
+
+           // X <- normalize(X) (after ADD_ADD -> norm_lshamt and X_exp_norm)
+           // Null results and exponent underflows are flushed to zero.
+           fpmi_is[FPMI_ADD_NORM]: begin
+              if(X_exp_norm <= 0 || (X_frac == 0)) begin
+                 X_frac <= 0;
+                 X_exp <= 0;
+              end else begin
+                 X_frac <= X_frac[48] ? (X_frac >> 1) : X_frac << norm_lshamt;
+                 X_exp  <= X_exp_norm;
+              end
+           end
+
+           // Y <- rounding increment for X: same sign and exponent as X,
+           // fraction is bit 24 if bit 23 of X_frac is set, zero otherwise.
+           fpmi_is[FPMI_LOAD_Y_ROUND]: begin
+              Y_sign <= X_sign;
+              Y_exp  <= X_exp;
+              Y_frac <= X_frac[23] ? (1 << 24) : 50'd0; 
+           end
+
+           // X <- result of comparison between X and Y
+           // X_LT_Y, X_LE_Y and X_EQ_Y order the operands by sign first, 
+           // hence they see -0 and +0 as two different values. For the
+           // comparison instructions, IEEE 754 requires both zeros to be
+           // equal, so the case where both fractions are null (both operands
+           // are zero, denormals being flushed to zero) is handled here.
+           fpmi_is[FPMI_CMP]: begin
+              `X <= { 31'b0, 
+                      isFLT &&  X_LT_Y && !((X_frac == 0) && (Y_frac == 0))  || 
+                      isFLE && (X_LE_Y ||  ((X_frac == 0) && (Y_frac == 0))) || 
+                      isFEQ && (X_EQ_Y ||  ((X_frac == 0) && (Y_frac == 0)))
+                    };
+           end
+
+           // Register-to-register moves
+           fpmi_is[FPMI_MV_B_D] : `FP_MV(B,D);
+           fpmi_is[FPMI_MV_B_E] : `FP_MV(B,E);
+           fpmi_is[FPMI_MV_A_X] : `FP_LD(A,X_sign,X_exp[7:0],X_frac[47:24]);
+           fpmi_is[FPMI_MV_C_A] : `FP_MV(C,A);
+           fpmi_is[FPMI_MV_E_X] : `FP_LD(E,X_sign,X_exp[7:0],X_frac[47:24]);
+
+           // B <= -|D| / 2.0
+           // (sign forced to 1, exponent decremented by one)
+           fpmi_is[FPMI_MV_B_NH_D]: 
+                        {B_sign, B_exp, B_frac} <= {1'b1,D_exp-8'd1,D_frac};
+
+           // Reciprocal prolog: D <- A; E <- B; 
+           //                    A <- -D'; B <- 32/17; C <- 48/17
+           fpmi_is[FPMI_FRCP_PROLOG]: begin
+              `FP_MV(D,A);
+              `FP_MV(E,B);
+               // A <= -D', that is, -(B normalized in [0.5,1])
+              `FP_LD(A,1'b1,8'd126, B_frac); 
+              `FP_LD32(B, 32'h3FF0F0F1); // 32/17
+              `FP_LD32(C, 32'h4034B4B5); // 48/17
+           end
+
+           // Reciprocal iteration: A <- -D'; B <- X; C <- 1.0 or 2.0
+           fpmi_is[FPMI_FRCP_ITER1]: begin
+              `FP_LD(A,1'b1,8'd126, E_frac);             // A <= -D'
+              `FP_LD(B,X_sign,X_exp[7:0],X_frac[47:24]); // B <= X
+               //                           1.0            2.0
+              `FP_LD32(C, PRECISE_DIV ? 32'h3f800000 : 32'h40000000); 
+           end
+
+           // This one is used only if PRECISE_DIV is set
+           fpmi_is[FPMI_FRCP_ITER2]: begin
+              `FP_LD(A,X_sign,X_exp[7:0],X_frac[47:24]); // A <= X
+              `FP_MV(C,B);
+           end
+
+           // A <- (E_sign, frcp_exp, X_frac); B <- D
+           fpmi_is[FPMI_FRCP_EPILOG]: begin
+              `FP_LD(A,E_sign,frcp_exp[7:0],X_frac[47:24]);
+              `FP_MV(B,D);
+           end
+
+           // This one is used only if PRECISE_DIV is set
+           fpmi_is[FPMI_FDIV_EPILOG]: begin
+              `FP_LD(B,!E_sign, E_exp, E_frac); // B <= -E
+              `FP_MV(C,D);
+              `FP_MV(D,A);
+           end
+
+           // D <- rs1; E,A,B <- rsqrt_doom_magic; C <- 1.5
+           fpmi_is[FPMI_FRSQRT_PROLOG]: begin
+              `FP_LD32(D, rs1);
+              `FP_LD32(E, rsqrt_doom_magic);
+              `FP_LD32(A, rsqrt_doom_magic);
+              `FP_LD32(B, rsqrt_doom_magic);
+              `FP_LD32(C, 32'h3fc00000); // 1.5
+           end
+
+           // X <- integer value of X (negated if X is negative, except for
+           // the unsigned conversion FCVT.WU.S)
+           fpmi_is[FPMI_FP_TO_INT]: begin
+              // TODO: check overflow
+              `X <= 
+               (isFCVTWUS | !X_sign) ? X_fcvt_ftoi_shifted 
+                                     : -$signed(X_fcvt_ftoi_shifted);
+           end
+
+           fpmi_is[FPMI_INT_TO_FP]: begin
+              // TODO: rounding
+              // We do a fake addition with zero, to prepare normalization
+              // (uses CLZ plugged on the adder).
+              X_frac <= 0;
+              // 127+23: standard exponent bias
+              // +6 because it is bit 29 of rs1 that overwrites 
+              //    bit 47 of Y_frac, instead of bit 23 (and 29-23 = 6).
+              X_exp  <= 127+23+6;
+              // E holds the raw 32 bits of rs1 (the integer to convert).
+              // Y_frac receives its absolute value for the signed conversion.
+              Y_frac <= 
+                 (isFCVTSWU | !E_sign) ? {E_sign, E_exp, E_frac[22:0], 18'd0}
+                           : {-$signed({E_sign, E_exp, E_frac[22:0]}), 18'd0};
+              Y_sign <= isFCVTSW & E_sign;
+           end 
+
+           // X <- min(X,Y) or max(X,Y)
+           // (here X_LT_Y is used as is: -0 is considered smaller than +0)
+           fpmi_is[FPMI_MIN_MAX]: begin
+              `X <=  (X_LT_Y ^ isFMAX)
+                                 ? {X_sign, X_exp[7:0], X_frac[46:24]}
+                                 : {Y_sign, Y_exp[7:0], Y_frac[46:24]};
+           end
+         endcase 
+      end
+   end
+`endif   
+
+   // Some circuitry used by the FPU micro-instructions:
+
+   // ******************* Comparisons ******************************************
+   // Compares the X and Y registers: first magnitudes (fabs*), using the 
+   // difference of exponents and the difference of fractions, then signed 
+   // values (X_LT_Y, X_LE_Y, X_EQ_Y).
+
+   // Exponent adder
+   wire signed [8:0]  exp_sum   = Y_exp + X_exp;
+   // exp_diff[8] (sign bit) set means Y_exp < X_exp
+   wire signed [8:0]  exp_diff  = Y_exp - X_exp;
+   
+   wire expX_EQ_expY   = (exp_diff  == 0);
+   wire fracX_EQ_fracY = (frac_diff == 0);
+   wire fabsX_EQ_fabsY = (expX_EQ_expY && fracX_EQ_fracY);
+
+   // |X| < |Y|: smaller exponent, or same exponent and smaller fraction
+   // (frac_diff = Y_frac - X_frac, its sign bit is frac_diff[50])
+   wire fabsX_LT_fabsY = (!exp_diff[8] && !expX_EQ_expY) || 
+                           (expX_EQ_expY && !fracX_EQ_fracY && !frac_diff[50]);
+
+   // |X| <= |Y|
+   wire fabsX_LE_fabsY = (!exp_diff[8] && !expX_EQ_expY) || 
+                                              (expX_EQ_expY && !frac_diff[50]);
+   
+   // |Y| < |X|
+   wire fabsY_LT_fabsX = exp_diff[8] || (expX_EQ_expY && frac_diff[50]);
+
+   // |Y| <= |X|
+   wire fabsY_LE_fabsX = exp_diff[8] || 
+                           (expX_EQ_expY && (frac_diff[50] || fracX_EQ_fracY));
+
+   // Signed comparisons. A negative X is always lower than a positive Y,
+   // whatever the magnitudes; when both are negative the order of the
+   // magnitudes is reversed.
+   // Note: as a consequence, -0 is seen as lower than (and different from) +0.
+   // NOTE(review): IEEE 754 comparisons consider that -0 == +0, confirm that
+   // the users of these wires account for it.
+   wire X_LT_Y = X_sign && !Y_sign ||
+	         X_sign &&  Y_sign && fabsY_LT_fabsX ||
+ 		!X_sign && !Y_sign && fabsX_LT_fabsY ;
+
+   wire X_LE_Y = X_sign && !Y_sign ||
+		 X_sign &&  Y_sign && fabsY_LE_fabsX ||
+ 	        !X_sign && !Y_sign && fabsX_LE_fabsY ;
+   
+   // Same magnitude and same sign
+   wire X_EQ_Y = fabsX_EQ_fabsY && (X_sign == Y_sign);
+
+   // ****************** Addition, subtraction *********************************
+   // Sum and difference of the fractions of Y and X, on 51 bits (signed).
+   // frac_sum feeds the adder micro-instruction and the leading zeroes 
+   // counter, frac_diff is used by the comparisons.
+   wire signed [50:0] frac_sum  = Y_frac + X_frac;
+   wire signed [50:0] frac_diff = Y_frac - X_frac;
+
+   // ****************** Product ***********************************************
+   // Product of the fractions of A and B
+   wire [49:0] prod_frac = A_frac * B_frac; // TODO: check overflows
+
+   // exponent of product, once normalized
+   // (obtained by writing expression of product and inspecting exponent)
+   // Two cases: first bit set = 47 or 46 (only possible cases with normals)
+   // NOTE(review): the result is kept on 9 signed bits; large values of
+   // A_exp+B_exp presumably wrap to a negative value (see TODO above).
+   wire signed [8:0] prod_exp_norm = A_exp+B_exp-127+{7'b0,prod_frac[47]};
+
+   // detect null product and underflows (all denormals are flushed to zero)
+   wire prod_Z = (prod_exp_norm <= 0) || !(|prod_frac[47:46]);
+
+   // ****************** Normalization *****************************************
+   // Count leading zeroes in X+Y (frac_sum)
+   // Note1: CLZ only work with power of two width (hence 13'b0 padding,
+   //        that extends the 51 bits of frac_sum to 64 bits).
+   // Note2: first bit set = 63 - CLZ (of course !)
+   wire [5:0] 	              frac_sum_clz;
+   CLZ clz2({13'b0,frac_sum}, frac_sum_clz);
+   reg [5:0] 		      norm_lshamt; // shift amount for ADD normalization
+
+   // Exponent of X once normalized = X_exp + first_bit_set - 47
+   //                               = X_exp + 63 - clz - 47 = X_exp + 16 - clz
+   // X_exp_norm <= X_exp + 16 - {3'b000,frac_sum_clz};
+   // (both norm_lshamt and X_exp_norm are written by the FPMI_ADD_ADD
+   //  micro-instruction and read by FPMI_ADD_NORM)
+   reg signed [8:0] X_exp_norm;
+
+   // ****************** Reciprocal (1/x), used by FDIV ************************
+   // Exponent for reciprocal (1/x)
+   // Initial value of x kept in E.
+   // (read by the FPMI_FRCP_EPILOG micro-instruction)
+   wire signed [8:0]  frcp_exp  = 9'd126 + X_exp - $signed({1'b0, E_exp});
+
+   // ****************** Reciprocal square root (1/sqrt(x)) ********************
+   // https://en.wikipedia.org/wiki/Fast_inverse_square_root
+   // Initial guess: magic constant minus the bit pattern of |A| shifted right
+   // by one, that is, {exponent, fraction without its lowest bit}.
+   wire [31:0] rsqrt_doom_magic = 32'h5f3759df - {1'b0,A_exp, A_frac[22:1]};
+
+   // ****************** Float to Integer conversion ***************************
+   // -127-23 is standard exponent bias
+   // -6 because it is bit 29 of X that corresponds to bit 47 of X_frac,
+   //    instead of bit 23 (and 23-29 = -6).
+   // A negative value means that a right shift is needed.
+   wire signed [8:0]  fcvt_ftoi_shift = A_exp - 9'd127 - 9'd23 - 9'd6; 
+   wire signed [8:0]  neg_fcvt_ftoi_shift = -fcvt_ftoi_shift;
+   
+   // The 32 upper bits of X_frac, shifted right or left according to the
+   // sign of fcvt_ftoi_shift. A right shift by 32 bits or more gives zero.
+   // Only the 5 lower bits of a left shift amount are used (overflows are
+   // not detected).
+   wire [31:0] 	X_fcvt_ftoi_shifted =  fcvt_ftoi_shift[8] ? // R or L shift
+                        (|neg_fcvt_ftoi_shift[8:5]  ?  0 :  // underflow
+                     ({X_frac[49:18]} >> neg_fcvt_ftoi_shift[4:0])) : 
+                     ({X_frac[49:18]} << fcvt_ftoi_shift[4:0]);
+   
+   // ******************* Classification ***************************************
+   // Result of FCLASS: a 10-bits mask, with the bit that corresponds to
+   // the class of rs1 set (see the bit numbers below).
+
+   wire rs1_exp_Z   = (rs1[30:23] == 0  );
+   wire rs1_exp_255 = (rs1[30:23] == 255);
+   wire rs1_frac_Z  = (rs1[22:0]  == 0  );
+
+   // Classes of rs1, regardless of its sign
+   wire rs1_is_neg       =  rs1[31];
+   wire rs1_is_inf       =  rs1_exp_255 &  rs1_frac_Z;
+   wire rs1_is_normal    = !rs1_exp_Z   & !rs1_exp_255;
+   wire rs1_is_subnormal =  rs1_exp_Z   & !rs1_frac_Z;
+   wire rs1_is_zero      =  rs1_exp_Z   &  rs1_frac_Z;
+   wire rs1_is_qNaN      =  rs1_exp_255 &  rs1[22];
+   wire rs1_is_sNaN      =  rs1_exp_255 & !rs1[22] & (|rs1[21:0]);
+
+   wire [31:0] fclass = {
+      22'b0,
+      rs1_is_qNaN,                     // 9: quiet NaN
+      rs1_is_sNaN,                     // 8: sig   NaN
+      !rs1_is_neg & rs1_is_inf,        // 7: +infinity
+      !rs1_is_neg & rs1_is_normal,     // 6: +normal
+      !rs1_is_neg & rs1_is_subnormal,  // 5: +subnormal
+      !rs1_is_neg & rs1_is_zero,       // 4: +0
+       rs1_is_neg & rs1_is_zero,       // 3: -0
+       rs1_is_neg & rs1_is_subnormal,  // 2: -subnormal
+       rs1_is_neg & rs1_is_normal,     // 1: -normal
+       rs1_is_neg & rs1_is_inf         // 0: -infinity
+   };
+
+   /************************************************************************/
+   
+   // RV32F instruction decoder
+   // See table p133 (RV32G instruction listings)
+   // Notes:
+   //  - FLW/FSW handled by LOAD/STORE in femtorv32 (instr[2] set if FLW/FSW)
+   //  - For all other F instructions, instr[6:5] == 2'b10
+   //  - FMADD/FMSUB/FNMADD/FNMSUB: instr[4] = 1'b0
+   //  - For all remaining F instructions, instr[4] = 1'b1
+   //  - FMV.X.W and FCLASS have same funct7 (7'b1110000),
+   //      (discriminated by instr[12])
+   //  - there is a big gotcha in the official doc for RV32F:
+   //        the doc says FNMADD computes -rs1*rs2-rs3
+   //          (yes, with *minus* rs3)
+   //        it should have said FNMADD computes -(rs1*rs2+rs3)
+   //                        and FNMSUB computes -(rs1*rs2-rs3)
+   //        they probably did not put the parentheses because when
+   //        you implement it, you change the sign of rs1 and rs3 according
+   //        to the operation rather than the sign of the whole result
+   //        (here, it is done by the FPMI_LOAD_XY_MUL micro instruction).
+
+   // One flag per decoded instruction, combinatorially computed from instr.
+   reg isFMADD, isFMSUB, isFNMSUB, isFNMADD;
+   reg isFADD, isFSUB, isFMUL, isFDIV, isFSQRT;
+   reg isFSGNJ, isFSGNJN, isFSGNJX;
+   reg isFMIN, isFMAX;
+   reg isFEQ, isFLT, isFLE;
+   reg isFCLASS, isFCVTWS, isFCVTWUS;
+   reg isFCVTSW, isFCVTSWU;
+   reg isFMVXW, isFMVWX;
+
+   // In the comparisons below, the fields of instr are concatenated and
+   // compared at once with the expected pattern, written as follows:
+   //    instr[4] _ instr[31:27] _ (additional discriminating bits)
+   always @(*) begin
+      // Fused multiply-add instructions, discriminated by instr[4:2] alone
+      isFMADD   = (instr[4:2] == 3'b000); // rd <-   rs1*rs2+rs3
+      isFMSUB   = (instr[4:2] == 3'b001); // rd <-   rs1*rs2-rs3
+      isFNMSUB  = (instr[4:2] == 3'b010); // rd <- -(rs1*rs2-rs3) 
+      isFNMADD  = (instr[4:2] == 3'b011); // rd <- -(rs1*rs2+rs3) 
+
+      // Arithmetic
+      isFADD    = ({instr[4], instr[31:27]} == 6'b1_00000);
+      isFSUB    = ({instr[4], instr[31:27]} == 6'b1_00001);
+      isFMUL    = ({instr[4], instr[31:27]} == 6'b1_00010);
+      isFDIV    = ({instr[4], instr[31:27]} == 6'b1_00011);
+      isFSQRT   = ({instr[4], instr[31:27]} == 6'b1_01011);
+
+      // Sign injection, discriminated by instr[13:12]
+      isFSGNJ   = ({instr[4], instr[31:27], instr[13:12]} == 8'b1_00100_00);
+      isFSGNJN  = ({instr[4], instr[31:27], instr[13:12]} == 8'b1_00100_01);
+      isFSGNJX  = ({instr[4], instr[31:27], instr[13:12]} == 8'b1_00100_10);
+
+      // Min and max, discriminated by instr[12]
+      isFMIN    = ({instr[4], instr[31:27], instr[12]} == 7'b1_00101_0);
+      isFMAX    = ({instr[4], instr[31:27], instr[12]} == 7'b1_00101_1);
+
+      // Comparisons, discriminated by instr[13:12]
+      isFEQ     = ({instr[4], instr[31:27], instr[13:12]} == 8'b1_10100_10);
+      isFLT     = ({instr[4], instr[31:27], instr[13:12]} == 8'b1_10100_01);
+      isFLE     = ({instr[4], instr[31:27], instr[13:12]} == 8'b1_10100_00);
+
+      // FCLASS and FMV.X.W share the same instr[31:27], 
+      // discriminated by instr[12]
+      isFCLASS  = ({instr[4], instr[31:27], instr[12]} == 7'b1_11100_1);
+      isFMVXW   = ({instr[4], instr[31:27], instr[12]} == 7'b1_11100_0);
+
+      // Conversions, signed and unsigned versions discriminated by instr[20]
+      isFCVTWS  = ({instr[4], instr[31:27], instr[20]} == 7'b1_11000_0);
+      isFCVTWUS = ({instr[4], instr[31:27], instr[20]} == 7'b1_11000_1);
+      isFCVTSW  = ({instr[4], instr[31:27], instr[20]} == 7'b1_11010_0);
+      isFCVTSWU = ({instr[4], instr[31:27], instr[20]} == 7'b1_11010_1);
+
+      isFMVWX   = ({instr[4], instr[31:27]} == 6'b1_11110);
+   end
+
+// Emulated FPU: when FPU_EMUL is defined, the result of each instruction is 
+// obtained in a single cycle (when wr is asserted) by calling a function 
+// named after the instruction, through $c32 (presumably Verilator's 
+// embedded C statement - TODO confirm).
+`ifdef FPU_EMUL
+ // X <- op(rs1), op(rs1,rs2), op(rs1,rs2,rs3)
+ `define FPU_EMUL1(op) `X <= $c32(op,"(",rs1,")")
+ `define FPU_EMUL2(op) `X <= $c32(op,"(",rs1,",",rs2,")")
+ `define FPU_EMUL3(op) `X <= $c32(op,"(",rs1,",",rs2,",",rs3,")")
+   always @(posedge clk) begin
+      if(wr) begin
+	 (* parallel_case *)
+	 case(1'b1)
+	   isFMUL   : `FPU_EMUL2("FMUL");
+	   isFADD   : `FPU_EMUL2("FADD");
+	   isFSUB   : `FPU_EMUL2("FSUB");
+	   isFDIV   : `FPU_EMUL2("FDIV");
+	   isFSQRT  : `FPU_EMUL1("FSQRT");	   
+	   isFMADD  : `FPU_EMUL3("FMADD");	  
+	   isFMSUB  : `FPU_EMUL3("FMSUB");	  
+	   isFNMADD : `FPU_EMUL3("FNMADD");	  
+	   isFNMSUB : `FPU_EMUL3("FNMSUB");	  
+	   isFEQ    : `FPU_EMUL2("FEQ");
+	   isFLT    : `FPU_EMUL2("FLT");
+	   isFLE    : `FPU_EMUL2("FLE");
+	   isFCVTWS : `FPU_EMUL1("FCVTWS"); 
+	   isFCVTWUS: `FPU_EMUL1("FCVTWUS");
+	   isFCVTSW : `FPU_EMUL1("FCVTSW"); 
+	   isFCVTSWU: `FPU_EMUL1("FCVTSWU"); 
+	   isFMIN   : `FPU_EMUL2("FMIN");
+	   isFMAX   : `FPU_EMUL2("FMAX");
+	   isFCLASS : `FPU_EMUL1("FCLASS");
+	   isFSGNJ  : `FPU_EMUL2("FSGNJ");
+	   isFSGNJN : `FPU_EMUL2("FSGNJN");
+	   isFSGNJX : `FPU_EMUL2("FSGNJX");
+	   // Moves between integer and FP registers: bits copied as is
+           isFMVXW | isFMVWX : `X <= rs1;
+         endcase		     
+      end		     
+   end
+`endif
+
+/****************************************************************************/
+// When doing simulations, compare the result of all operations with
+// what's computed on the host CPU. 
+// Note: my FDIV and FSQRT are not IEEE754 compliant (yet) ! 
+// (the FSQRT check is commented-out for now)
+
+`ifdef NRV_FEMTORV32_PETITBATEAU // makes sure we are in the learn-FPGA fmwk
+`ifdef VERILATOR   
+
+ // Call CHECK_<op>(result, operands...) through $c32. The returned value 
+ // is stored in z, that is not read anywhere in this block.
+ `define FPU_CHECK1(op) \
+       z <= $c32("CHECK_",op,"(",`X,",",rs1,")")
+ `define FPU_CHECK2(op) \
+       z <= $c32("CHECK_",op,"(",`X,",",rs1,",",rs2,")")
+ `define FPU_CHECK3(op) \
+       z <= $c32("CHECK_",op,"(",`X,",",rs1,",",rs2,",",rs3,")")
+   
+   reg [31:0] z;
+   reg 	      active; // set when an operation was started and not checked yet
+   
+   always @(posedge clk) begin
+      
+      // An operation starts
+      if(wr) begin
+	 active  <= 1'b1;
+      end
+      
+      // The operation is finished (no longer busy): check its result, using
+      // the current values of the decoder flags and of rs1, rs2, rs3.
+      // NOTE(review): assumes that instr, rs1, rs2 and rs3 still hold the
+      // values of the checked operation at this point - confirm.
+      if(active && !busy) begin
+	 active <= 1'b0;
+	 case(1'b1)
+	   isFMUL :   `FPU_CHECK2("FMUL");
+	   isFADD :   `FPU_CHECK2("FADD");
+	   isFSUB :   `FPU_CHECK2("FSUB");
+	   isFDIV :   `FPU_CHECK2("FDIV");  
+	   // isFSQRT:   `FPU_CHECK1("FSQRT"); // yes I know, not IEEE754 yet
+	   isFMADD:   `FPU_CHECK3("FMADD");	  
+	   isFMSUB:   `FPU_CHECK3("FMSUB");	  
+	   isFNMADD:  `FPU_CHECK3("FNMADD");	  
+	   isFNMSUB:  `FPU_CHECK3("FNMSUB");	  
+	   isFEQ:     `FPU_CHECK2("FEQ");
+	   isFLT:     `FPU_CHECK2("FLT");
+	   isFLE:     `FPU_CHECK2("FLE");
+	   isFCVTWS : `FPU_CHECK1("FCVTWS"); 
+	   isFCVTWUS: `FPU_CHECK1("FCVTWUS");
+	   isFCVTSW : `FPU_CHECK1("FCVTSW"); 
+	   isFCVTSWU: `FPU_CHECK1("FCVTSWU"); 
+	   isFMIN:    `FPU_CHECK2("FMIN");
+	   isFMAX:    `FPU_CHECK2("FMAX");
+	 endcase
+      end
+   end 
+
+`endif
+`endif
+
+endmodule   
+   
+/**********************************************************************/
+
+// FPU Normalization needs to detect the position of the first bit set 
+// in the A_frac register. It is easier to count the number of leading 
+// zeroes (CLZ for Count Leading Zeroes), as follows. See:
+// https://electronics.stackexchange.com/questions/196914/
+//    verilog-synthesize-high-speed-leading-zero-count
+// TODO: test also Dean Gaudet's algorithm (see Hackers Delights p. 110)
+// Count Leading Zeroes, recursive implementation:
+// - if the upper half of the input is null, then the result is W_IN/2 plus
+//   the number of leading zeroes of the lower half,
+// - else it is the number of leading zeroes of the upper half.
+// Since W_IN/2 is a power of two, the addition is a simple concatenation.
+// Note: a null input gives W_IN-1 (all output bits set).
+module CLZ #(
+   parameter W_IN  = 64,          // input width, must be power of 2, >= 2
+   parameter W_OUT = $clog2(W_IN) // output width
+) (
+   input  wire [W_IN-1:0]  in,
+   output wire [W_OUT-1:0] out
+);
+   localparam HALF = W_IN/2;
+   generate
+      if(W_IN != 2) begin
+         wire [HALF-1:0]  upper = in[W_IN-1:HALF];
+         wire [HALF-1:0]  lower = in[HALF-1:0];
+         wire             upper_is_zero = ~|upper;
+         // The half that contains the first bit set (if any)
+         wire [HALF-1:0]  searched = upper_is_zero ? lower : upper;
+         wire [W_OUT-2:0] sub_count;
+         CLZ #(.W_IN(HALF)) inner(.in(searched), .out(sub_count));
+         assign out = {upper_is_zero, sub_count};
+      end else begin
+         // Terminal case, two bits: one leading zero if the upper bit is null
+         assign out = !in[1];
+      end
+   endgenerate
+endmodule   
+
+`endif
@@ -0,0 +1,22 @@
+/********************* Utilities, macros for debugging *************/
+
+// `verbose(command): expands to command if VERBOSE is defined, 
+// to nothing otherwise.
+`ifdef VERBOSE
+  `define verbose(command) command
+`else
+  `define verbose(command)
+`endif
+
+// `bench(command): expands to command if BENCH is defined and QUIET is not,
+// to nothing otherwise.
+// BENCH_OR_LINT is defined whenever BENCH is defined.
+`ifdef BENCH
+ `define BENCH_OR_LINT
+ `ifdef QUIET
+  `define bench(command) 
+ `else
+  `define bench(command) command
+ `endif
+`else
+  `define bench(command)
+`endif
+
+// BENCH_OR_LINT is also defined when the verilator macro is defined
+`ifdef verilator
+ `define BENCH_OR_LINT
+`endif