initial commit

2025-08-02 06:09:31 +03:00
commit 00015ffc03
85 changed files with 62051 additions and 0 deletions
--- a/BOARDS/arty.xdc
+++ b/BOARDS/arty.xdc
@@ -0,0 +1,27 @@
+# Pin and timing constraints for the Arty build. BOARDS/run_arty.sh hands this
+# file to nextpnr-xilinx through --xdc. Every port below uses LVCMOS33.
+
+# Clock pin (design clock port CLK)
+set_property PACKAGE_PIN E3 [get_ports CLK]
+set_property IOSTANDARD LVCMOS33 [get_ports CLK]
+
+# LEDs (four ports, LEDS[0]..LEDS[3])
+set_property PACKAGE_PIN H5  [get_ports LEDS[0]]
+set_property PACKAGE_PIN J5  [get_ports LEDS[1]]
+set_property PACKAGE_PIN T9  [get_ports LEDS[2]]
+set_property PACKAGE_PIN T10 [get_ports LEDS[3]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[0]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[1]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[2]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[3]]
+
+# Clock constraints: a 10.0 ns period is a 100 MHz clock, which agrees with
+# BOARD_FREQ=100 in BOARDS/run_arty.sh.
+create_clock -period 10.0 [get_ports CLK]
+
+# UART (these pins are assigned with LOC; the ports above use PACKAGE_PIN)
+set_property LOC D10 [get_ports TXD]
+set_property LOC A9 [get_ports RXD]
+set_property IOSTANDARD LVCMOS33 [get_ports RXD]
+set_property IOSTANDARD LVCMOS33 [get_ports TXD]
+
+# reset button
+set_property LOC C2 [get_ports RESET]
+set_property IOSTANDARD LVCMOS33 [get_ports RESET]
+
--- a/BOARDS/cmod_a7.xdc
+++ b/BOARDS/cmod_a7.xdc
@@ -0,0 +1,29 @@
+# Pin and timing constraints for the Cmod A7 build. BOARDS/run_cmod_a7.sh hands
+# this file to nextpnr-xilinx through --xdc. Every port below uses LVCMOS33.
+
+# Clock pin (design clock port CLK)
+set_property PACKAGE_PIN L17 [get_ports CLK]
+set_property IOSTANDARD LVCMOS33 [get_ports CLK]
+
+# LEDs (five ports, LEDS[0]..LEDS[4])
+set_property PACKAGE_PIN A17 [get_ports LEDS[0]]
+set_property PACKAGE_PIN C16 [get_ports LEDS[1]]
+set_property PACKAGE_PIN B17 [get_ports LEDS[2]]
+set_property PACKAGE_PIN B16 [get_ports LEDS[3]]
+set_property PACKAGE_PIN C17 [get_ports LEDS[4]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[0]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[1]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[2]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[3]]
+set_property IOSTANDARD LVCMOS33 [get_ports LEDS[4]]
+
+# Clock constraints: an 83.33 ns period is roughly a 12 MHz clock.
+# NOTE(review): BOARDS/run_cmod_a7.sh defines BOARD_FREQ=100 - confirm which
+# of the two values describes the clock actually driving CLK.
+create_clock -period 83.33 [get_ports CLK]
+
+# UART (these pins are assigned with LOC; the ports above use PACKAGE_PIN)
+set_property LOC G17 [get_ports TXD]
+set_property LOC G19 [get_ports RXD]
+set_property IOSTANDARD LVCMOS33 [get_ports RXD]
+set_property IOSTANDARD LVCMOS33 [get_ports TXD]
+
+# reset button
+set_property LOC A18 [get_ports RESET]
+set_property IOSTANDARD LVCMOS33 [get_ports RESET]
+
--- a/BOARDS/ecp5_evn.lpf
+++ b/BOARDS/ecp5_evn.lpf
@@ -0,0 +1,35 @@
+# Constraints for the ecp5_evn target; BOARDS/run_ecp5evn.sh passes this file
+# to nextpnr-ecp5 as BOARDS/$BOARD.lpf.
+# The file layout mirrors the ULX3S constraint file, see
+# https://github.com/emard/ulx3s/blob/master/doc/constraints/ulx3s_v20.lpf
+# (the sites below differ from the ones in BOARDS/ulx3s.lpf).
+
+## Clock #########################################
+# 12 MHz, same value as BOARD_FREQ in BOARDS/run_ecp5evn.sh
+
+LOCATE COMP "CLK" SITE "A10";
+IOBUF PORT "CLK" IO_TYPE=LVCMOS33;
+FREQUENCY PORT "CLK" 12 MHZ;
+
+## RESET button ##################################
+
+LOCATE COMP "RESET" SITE "P4"; 
+IOBUF PORT "RESET" IO_TYPE=LVCMOS33;
+
+## LEDs ##########################################
+# Five LED ports, LEDS[0]..LEDS[4]
+
+LOCATE COMP "LEDS[0]" SITE "B17";
+LOCATE COMP "LEDS[1]" SITE "A17";
+LOCATE COMP "LEDS[2]" SITE "C17";
+LOCATE COMP "LEDS[3]" SITE "B18";
+LOCATE COMP "LEDS[4]" SITE "A18";
+
+IOBUF PORT "LEDS[0]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[1]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[2]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[3]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[4]"  IO_TYPE=LVCMOS33;
+
+## UART ######################################################
+# Both lines are pulled up; TXD additionally sets DRIVE=4.
+
+LOCATE COMP "TXD"   SITE "D11"; 
+LOCATE COMP "RXD"   SITE "D12"; 
+
+IOBUF PORT "TXD" PULLMODE=UP IO_TYPE=LVCMOS33 DRIVE=4;
+IOBUF PORT "RXD" PULLMODE=UP IO_TYPE=LVCMOS33;
+
--- a/BOARDS/icebreaker.pcf
+++ b/BOARDS/icebreaker.pcf
@@ -0,0 +1,13 @@
+# Pin assignments for the icebreaker target; BOARDS/run_icebreaker.sh passes
+# this file to nextpnr-ice40 (--pcf) and to icetime (-p).
+
+# Clock
+set_io CLK 35
+
+# LEDs (five ports, LEDS[0]..LEDS[4])
+set_io LEDS[0] 27
+set_io LEDS[1] 21
+set_io LEDS[2] 25
+set_io LEDS[3] 23
+set_io LEDS[4] 26
+
+# UART
+set_io TXD 9
+set_io RXD 6
+
+# Reset (run_icebreaker.sh builds with -DNEGATIVE_RESET)
+set_io RESET 10
+
--- a/BOARDS/icestick.pcf
+++ b/BOARDS/icestick.pcf
@@ -0,0 +1,21 @@
+# Pin assignments for the icestick target; BOARDS/run_icestick.sh and
+# BOARDS/run_icestick_show.sh pass this file to nextpnr-ice40 (--pcf).
+
+# Clock
+set_io CLK 21
+
+# LEDs (five ports, LEDS[0]..LEDS[4])
+set_io LEDS[0] 99
+set_io LEDS[1] 98
+set_io LEDS[2] 97
+set_io LEDS[3] 96
+set_io LEDS[4] 95
+
+# UART
+set_io TXD  8
+set_io RXD  9
+
+# SPI flash clock and chip select
+set_io SPIFLASH_CLK  70
+set_io SPIFLASH_CS_N 71
+
+# SPI flash data, named as separate MOSI/MISO ports ...
+set_io SPIFLASH_MOSI 67
+set_io SPIFLASH_MISO 68
+
+# ... and as a two-bit SPIFLASH_IO bus on the same two pins (67 and 68).
+# NOTE(review): presumably a design declares only one of the two naming
+# variants; the run scripts use --pcf-allow-unconstrained - confirm.
+set_io SPIFLASH_IO[0] 67
+set_io SPIFLASH_IO[1] 68
+
+# Reset
+set_io RESET 47
--- a/BOARDS/run_arty.sh
+++ b/BOARDS/run_arty.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+PROJECT_NAME=SOC
+DB_DIR=/usr/share/nextpnr/prjxray-db
+CHIPDB_DIR=/usr/share/nextpnr/xilinx-chipdb
+PART=xc7a35tcsg324-1
+VERILOGS=$1
+BOARD_FREQ=100
+CPU_FREQ=100
+
+set -ex
+yosys -DARTY -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -p "scratchpad -set xilinx_dsp.multonly 1" -p "synth_xilinx -nowidelut -flatten -abc9 -arch xc7 -top SOC; write_json ${PROJECT_NAME}.json" ${VERILOGS}
+nextpnr-xilinx --chipdb ${CHIPDB_DIR}/xc7a35t.bin --xdc BOARDS/arty.xdc --json ${PROJECT_NAME}.json --write ${PROJECT_NAME}_routed.json --fasm ${PROJECT_NAME}.fasm
+fasm2frames --part ${PART} --db-root ${DB_DIR}/artix7 ${PROJECT_NAME}.fasm > ${PROJECT_NAME}.frames
+xc7frames2bit --part_file ${DB_DIR}/artix7/${PART}/part.yaml --part_name ${PART} --frm_file ${PROJECT_NAME}.frames --output_file ${PROJECT_NAME}.bit
+#To send to SRAM:
+openFPGALoader --board arty ${PROJECT_NAME}.bit
+#To send to FLASH: 
+#openFPGALoader --board arty -f ${PROJECT_NAME}.bit
--- a/BOARDS/run_cmod_a7.sh
+++ b/BOARDS/run_cmod_a7.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+PROJECT_NAME=SOC
+DB_DIR=/usr/share/nextpnr/prjxray-db
+CHIPDB_DIR=/usr/share/nextpnr/xilinx-chipdb
+PART=xc7a35tcpg236-1
+VERILOGS=$1
+BOARD_FREQ=100
+CPU_FREQ=100
+
+set -ex
+yosys -DCMODA7 -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -p "scratchpad -set xilinx_dsp.multonly 1" -p "synth_xilinx -nowidelut -flatten -abc9 -arch xc7 -top SOC; write_json ${PROJECT_NAME}.json" ${VERILOGS}
+nextpnr-xilinx --chipdb ${CHIPDB_DIR}/xc7a35tcpg236-1.bin --xdc BOARDS/cmod_a7.xdc --json ${PROJECT_NAME}.json --write ${PROJECT_NAME}_routed.json --fasm ${PROJECT_NAME}.fasm
+fasm2frames --part ${PART} --db-root ${DB_DIR}/artix7 ${PROJECT_NAME}.fasm > ${PROJECT_NAME}.frames
+xc7frames2bit --part_file ${DB_DIR}/artix7/${PART}/part.yaml --part_name ${PART} --frm_file ${PROJECT_NAME}.frames --output_file ${PROJECT_NAME}.bit
+#To send to SRAM:
+openFPGALoader --freq 30e6 -c digilent --fpga-part xc7a35 femtosoc.bit
+#To send to FLASH: 
+# openFPGALoader --freq 30e6 -c digilent --fpga-part xc7a35tcpg236 -f femtosoc.bit
--- a/BOARDS/run_ecp5evn.sh
+++ b/BOARDS/run_ecp5evn.sh
@@ -0,0 +1,13 @@
+PROJECTNAME=SOC
+BOARD=ecp5_evn
+BOARD_FREQ=12
+CPU_FREQ=100
+FPGA_VARIANT=um5g-85k
+FPGA_PACKAGE=CABGA381
+VERILOGS=$1
+
+yosys -q -DECP5_EVN -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -p "synth_ecp5 -abc9 -top $PROJECTNAME -json $PROJECTNAME.json" $VERILOGS  || exit
+nextpnr-ecp5 --force --timing-allow-fail --json $PROJECTNAME.json --lpf BOARDS/$BOARD.lpf --textcfg $PROJECTNAME"_out".config --freq $BOARD_FREQ --$FPGA_VARIANT --package $FPGA_PACKAGE || exit
+ecppack --compress --svf-rowsize 100000 --svf $PROJECTNAME".svf" $PROJECTNAME"_out.config" $PROJECTNAME".bit" || exit
+ujprog -j FLASH $PROJECTNAME".bit"  || exit
+
--- a/BOARDS/run_gowin.sh
+++ b/BOARDS/run_gowin.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# --- CONFIGURATION ---
+PROJECTNAME=SOC
+DEVICE='GW2A-LV18PG256C8/I7'
+BOARD='tangprimer20k'
+BOARD_FREQ=27
+CPU_FREQ=50
+VERILOGS=$1
+
+# --- Synthesis with Yosys ---
+yosys -q -DPRIMER20K -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -D INV_BTN=0 -p "
+    read_verilog $VERILOGS;
+    synth_gowin -top $PROJECTNAME -json $PROJECTNAME.json -family gw2a" || exit 1
+
+# --- Placement and Routing with nextpnr-himbaechel ---
+nextpnr-himbaechel \
+    --json $PROJECTNAME.json \
+    --write $PROJECTNAME"_pnr.json" \
+    --device $DEVICE \
+    --vopt cst=BOARDS/$BOARD.cst \
+    --vopt family=GW2A-18 \
+    --freq $BOARD_FREQ || exit 1
+
+# --- Bitstream Packing with gowin_pack ---
+gowin_pack -d $DEVICE -o $PROJECTNAME.fs $PROJECTNAME"_pnr.json" || exit 1
+
+# --- Programming with openFPGALoader ---
+openFPGALoader -b tangprimer20k $PROJECTNAME.fs || exit 1
+
--- a/BOARDS/run_icebreaker.sh
+++ b/BOARDS/run_icebreaker.sh
@@ -0,0 +1,14 @@
+PROJECTNAME=SOC
+BOARD=icebreaker
+BOARD_FREQ=12
+CPU_FREQ=20
+FPGA_VARIANT=up5k
+FPGA_PACKAGE=sg48
+VERILOGS=$1
+yosys -q -DICE_BREAKER -DNEGATIVE_RESET -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -p "synth_ice40 -abc9 -device u -dsp -top $PROJECTNAME -json $PROJECTNAME.json" $VERILOGS  || exit
+nextpnr-ice40 --force --json $PROJECTNAME.json --pcf BOARDS/$BOARD.pcf --asc $PROJECTNAME.asc --freq $BOARD_FREQ --$FPGA_VARIANT --package $FPGA_PACKAGE --pcf-allow-unconstrained || exit
+icetime -p BOARDS/$BOARD.pcf -P $FPGA_PACKAGE -r $PROJECTNAME.timings -d up5k -t $PROJECTNAME.asc
+icepack $PROJECTNAME.asc $PROJECTNAME.bin || exit
+iceprog $PROJECTNAME.bin || exit
+echo DONE.
+
--- a/BOARDS/run_icestick.sh
+++ b/BOARDS/run_icestick.sh
@@ -0,0 +1,14 @@
+PROJECTNAME=SOC
+BOARD=icestick
+BOARD_FREQ=12
+CPU_FREQ=45
+FPGA_VARIANT=hx1k
+FPGA_PACKAGE=tq144
+VERILOGS=$1
+yosys -q -DICE_STICK -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -p "synth_ice40 -relut -top $PROJECTNAME -json $PROJECTNAME.json" $VERILOGS  || exit
+nextpnr-ice40 --force --timing-allow-fail --json $PROJECTNAME.json --pcf BOARDS/$BOARD.pcf --asc $PROJECTNAME.asc --freq $CPU_FREQ --$FPGA_VARIANT --package $FPGA_PACKAGE --pcf-allow-unconstrained --opt-timing || exit
+icetime -p BOARDS/$BOARD.pcf -P $FPGA_PACKAGE -r $PROJECTNAME.timings -d hx1k -t $PROJECTNAME.asc
+icepack $PROJECTNAME.asc $PROJECTNAME.bin || exit
+iceprog $PROJECTNAME.bin || exit
+echo DONE.
+
--- a/BOARDS/run_icestick_show.sh
+++ b/BOARDS/run_icestick_show.sh
@@ -0,0 +1,9 @@
+PROJECTNAME=SOC
+BOARD=icestick
+BOARD_FREQ=12
+CPU_FREQ=45
+FPGA_VARIANT=hx1k
+FPGA_PACKAGE=tq144
+VERILOGS=$1
+yosys -q -DICE_STICK -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -p "synth_ice40 -relut -top $PROJECTNAME -json $PROJECTNAME.json" $VERILOGS  || exit
+nextpnr-ice40 --gui --force --timing-allow-fail --json $PROJECTNAME.json --pcf BOARDS/$BOARD.pcf --asc $PROJECTNAME.asc --freq $CPU_FREQ --$FPGA_VARIANT --package $FPGA_PACKAGE --pcf-allow-unconstrained --opt-timing || exit
--- a/BOARDS/run_ulx3s.sh
+++ b/BOARDS/run_ulx3s.sh
@@ -0,0 +1,13 @@
+PROJECTNAME=SOC
+BOARD=ulx3s
+BOARD_FREQ=25
+CPU_FREQ=100
+FPGA_VARIANT=85k
+FPGA_PACKAGE=CABGA381
+VERILOGS=$1
+
+yosys -q -DULX3S -DBOARD_FREQ=$BOARD_FREQ -DCPU_FREQ=$CPU_FREQ -p "synth_ecp5 -abc9 -top $PROJECTNAME -json $PROJECTNAME.json" $VERILOGS  || exit
+nextpnr-ecp5 --force --timing-allow-fail --json $PROJECTNAME.json --lpf BOARDS/$BOARD.lpf --textcfg $PROJECTNAME"_out".config --freq $BOARD_FREQ --$FPGA_VARIANT --package $FPGA_PACKAGE || exit
+ecppack --compress --svf-rowsize 100000 --svf $PROJECTNAME".svf" $PROJECTNAME"_out.config" $PROJECTNAME".bit" || exit
+ujprog -j FLASH $PROJECTNAME".bit"  || exit
+
--- a/BOARDS/tangprimer20k.cst
+++ b/BOARDS/tangprimer20k.cst
@@ -0,0 +1,143 @@
+// Pin constraints for the tangprimer20k target; BOARDS/run_gowin.sh passes this
+// file to nextpnr-himbaechel via --vopt cst=BOARDS/$BOARD.cst.
+//
+// The file lists many port names and several of them share a site (for
+// example C13 is given to led[0], LED_R, oser_out and div_led), so it cannot
+// describe a single design.
+// NOTE(review): presumably only entries whose port exists in the synthesized
+// top-level take effect - confirm. Also confirm the port names of the
+// PRIMER20K top-level: the other board files in this commit use CLK, LEDS[n]
+// and RESET, while this file uses clk, led[n] and rst_i (only TXD and RXD
+// match).
+
+// Clock, key and reset
+IO_LOC  "clk" H11;
+IO_PORT "clk" IO_TYPE=LVCMOS33;
+IO_LOC  "key_i" T3;
+IO_LOC  "rst_i" T10;
+IO_PORT "rst_i" IO_TYPE=LVCMOS33;
+
+IO_LOC  "clk_i" IOT27A;
+
+// LEDs
+IO_LOC  "led[0]" C13;
+IO_PORT "led[0]" IO_TYPE=LVCMOS33;
+IO_LOC  "led[1]" A13;
+IO_PORT "led[1]" IO_TYPE=LVCMOS33;
+IO_LOC  "led[2]" N16;
+IO_PORT "led[2]" IO_TYPE=LVCMOS33;
+IO_LOC  "led[3]" N14;
+IO_PORT "led[3]" IO_TYPE=LVCMOS33;
+IO_LOC  "led[4]" L14;
+IO_PORT "led[4]" IO_TYPE=LVCMOS33;
+IO_LOC  "led[5]" L16;
+IO_PORT "led[5]" IO_TYPE=LVCMOS33;
+
+// UART (both lines pulled up)
+IO_LOC  "TXD" A15;
+IO_PORT "TXD" IO_TYPE=LVCMOS33 PULL_MODE=UP;
+IO_LOC  "RXD" D14;
+IO_PORT "RXD" IO_TYPE=LVCMOS33 PULL_MODE=UP;
+
+// fake: led[6] and led[7] reuse the UART sites A15 and D14
+IO_LOC  "led[6]" A15;
+IO_PORT "led[6]" IO_TYPE=LVCMOS33 PULL_MODE=NONE;
+IO_LOC  "led[7]" D14;
+IO_PORT "led[7]" IO_TYPE=LVCMOS33 PULL_MODE=NONE;
+
+// LVDS pairs (IO_TYPE=LVDS25)
+IO_LOC  "tlvds_p" P6;
+IO_PORT "tlvds_p" IO_TYPE=LVDS25 PULL_MODE=NONE;
+IO_LOC  "tlvds_n" T6;
+IO_PORT "tlvds_n" IO_TYPE=LVDS25 PULL_MODE=NONE;
+
+IO_LOC  "elvds_p" C12;
+IO_PORT "elvds_p" IO_TYPE=LVDS25 PULL_MODE=NONE;
+IO_LOC  "elvds_n" B12;
+IO_PORT "elvds_n" IO_TYPE=LVDS25 PULL_MODE=NONE;
+
+// RGB LED names; same sites as led[0], led[1] and led[2]
+IO_LOC  "LED_R" C13;
+IO_PORT "LED_R" IO_TYPE=LVCMOS33;
+IO_LOC  "LED_G" A13;
+IO_PORT "LED_G" IO_TYPE=LVCMOS33;
+IO_LOC  "LED_B" N16;
+IO_PORT "LED_B" IO_TYPE=LVCMOS33;
+
+// oser
+IO_LOC  "oser_out" C13;
+IO_PORT "oser_out" IO_TYPE=LVCMOS33;
+IO_LOC  "fclk_o"   N16;
+IO_PORT "fclk_o"   IO_TYPE=LVCMOS33;
+IO_LOC  "pclk_o"   N14;
+IO_PORT "pclk_o"   IO_TYPE=LVCMOS33;
+
+// ides
+IO_LOC  "fclk_i" B13;
+IO_PORT "fclk_i" IO_TYPE=LVCMOS33;
+IO_LOC  "data_i" C12;
+IO_PORT "data_i" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[0]" P9;
+IO_PORT "q_o[0]" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[1]" E15;
+IO_PORT "q_o[1]" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[2]" T7;
+IO_PORT "q_o[2]" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[3]" R8;
+IO_PORT "q_o[3]" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[4]" T6;
+IO_PORT "q_o[4]" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[5]" P6;
+IO_PORT "q_o[5]" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[6]" T8;
+IO_PORT "q_o[6]" IO_TYPE=LVCMOS33;
+IO_LOC  "q_o[7]" P8;
+IO_PORT "q_o[7]" IO_TYPE=LVCMOS33;
+
+// RGB  LCD
+// NOTE(review): "LCD_HYNC" may be a misspelling of LCD_HSYNC; the name has to
+// match the design's port, so it is left as is - confirm.
+IO_LOC  "LCD_CLK"  R9;
+IO_PORT "LCD_CLK"  IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_HYNC" A15;
+IO_PORT "LCD_HYNC" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_SYNC" D14;
+IO_PORT "LCD_SYNC" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_DEN"  E15;
+IO_PORT "LCD_DEN"  IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_R[0]" L9;
+IO_PORT "LCD_R[0]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_R[1]" N8;
+IO_PORT "LCD_R[1]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_R[2]" N9;
+IO_PORT "LCD_R[2]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_R[3]" N7;
+IO_PORT "LCD_R[3]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_R[4]" N6;
+IO_PORT "LCD_R[4]" IO_TYPE=LVCMOS33;
+
+IO_LOC  "LCD_G[0]" D11;
+IO_PORT "LCD_G[0]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_G[1]" A11;
+IO_PORT "LCD_G[1]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_G[2]" B11;
+IO_PORT "LCD_G[2]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_G[3]" P7;
+IO_PORT "LCD_G[3]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_G[4]" R7;
+IO_PORT "LCD_G[4]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_G[5]" D10;
+IO_PORT "LCD_G[5]" IO_TYPE=LVCMOS33;
+
+IO_LOC  "LCD_B[0]" B12;
+IO_PORT "LCD_B[0]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_B[1]" C12;
+IO_PORT "LCD_B[1]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_B[2]" B13;
+IO_PORT "LCD_B[2]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_B[3]" A14;
+IO_PORT "LCD_B[3]" IO_TYPE=LVCMOS33;
+IO_LOC  "LCD_B[4]" B14;
+IO_PORT "LCD_B[4]" IO_TYPE=LVCMOS33;
+
+// DVI (TMDS clock pair and three data pairs; no IO_TYPE is given here)
+IO_LOC  "tmds_clk_p" G16;
+IO_PORT "tmds_clk_p" PULL_MODE=NONE DRIVE=3.5;
+IO_LOC  "tmds_clk_n" H15;
+IO_PORT "tmds_clk_n" PULL_MODE=NONE DRIVE=3.5;
+IO_LOC  "tmds_d_p[0]" H14;
+IO_PORT "tmds_d_p[0]" PULL_MODE=NONE DRIVE=3.5;
+IO_LOC  "tmds_d_n[0]" H16;
+IO_PORT "tmds_d_n[0]" PULL_MODE=NONE DRIVE=3.5;
+IO_LOC  "tmds_d_p[1]" J15;
+IO_PORT "tmds_d_p[1]" PULL_MODE=NONE DRIVE=3.5;
+IO_LOC  "tmds_d_n[1]" K16;
+IO_PORT "tmds_d_n[1]" PULL_MODE=NONE DRIVE=3.5;
+IO_LOC  "tmds_d_p[2]" K14;
+IO_PORT "tmds_d_p[2]" PULL_MODE=NONE DRIVE=3.5;
+IO_LOC  "tmds_d_n[2]" K15;
+IO_PORT "tmds_d_n[2]" PULL_MODE=NONE DRIVE=3.5;
+
+// div_led: same site as led[0]
+IO_LOC  "div_led" C13;
+IO_PORT "div_led" IO_TYPE=LVCMOS33;
--- a/BOARDS/ulx3s.lpf
+++ b/BOARDS/ulx3s.lpf
@@ -0,0 +1,35 @@
+# Constraints for the ulx3s target; BOARDS/run_ulx3s.sh passes this file to
+# nextpnr-ecp5 as BOARDS/$BOARD.lpf.
+# See https://github.com/emard/ulx3s/blob/master/doc/constraints/ulx3s_v20.lpf
+
+## Clock #########################################
+# 25 MHz, same value as BOARD_FREQ in BOARDS/run_ulx3s.sh
+
+LOCATE COMP "CLK" SITE "G2";
+IOBUF PORT "CLK" PULLMODE=NONE IO_TYPE=LVCMOS33;
+FREQUENCY PORT "CLK" 25 MHZ;
+
+## RESET button ##################################
+
+LOCATE COMP "RESET" SITE "T1"; # fire 2
+IOBUF PORT "RESET" IO_TYPE=LVCMOS33;
+
+## LEDs ##########################################
+# Five LED ports, LEDS[0]..LEDS[4]
+
+LOCATE COMP "LEDS[0]" SITE "B2";
+LOCATE COMP "LEDS[1]" SITE "C2";
+LOCATE COMP "LEDS[2]" SITE "C1";
+LOCATE COMP "LEDS[3]" SITE "D2";
+LOCATE COMP "LEDS[4]" SITE "D1";
+
+IOBUF PORT "LEDS[0]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[1]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[2]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[3]"  IO_TYPE=LVCMOS33;
+IOBUF PORT "LEDS[4]"  IO_TYPE=LVCMOS33;
+
+## UART ######################################################
+# Both lines are pulled up; TXD additionally sets DRIVE=4.
+
+LOCATE COMP "TXD"   SITE "L4"; # FPGA transmits to ftdi
+LOCATE COMP "RXD"   SITE "M1"; # FPGA receives from ftdi
+
+IOBUF PORT "TXD" PULLMODE=UP IO_TYPE=LVCMOS33 DRIVE=4;
+IOBUF PORT "RXD" PULLMODE=UP IO_TYPE=LVCMOS33;
+
--- a/FIRMWARE/COREMARK/core_list_join.c
+++ b/FIRMWARE/COREMARK/core_list_join.c
@@ -0,0 +1,595 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/*
+Topic: Description
+        Benchmark using a linked list.
+
+        Linked list is a common data structure used in many applications.
+
+        For our purposes, this will exercise the memory units of the processor.
+        In particular, usage of the list pointers to find and alter data.
+
+        We are not using Malloc since some platforms do not support this
+library.
+
+        Instead, the memory block being passed in is used to create a list,
+        and the benchmark takes care not to add more items than can be
+        accommodated by the memory block. The porting layer will make sure
+        that we have a valid memory block.
+
+        All operations are done in place, without using any extra memory.
+
+        The list itself contains list pointers and pointers to data items.
+        Data items contain the following:
+
+        idx - An index that captures the initial order of the list.
+        data - Variable data initialized based on the input parameters. The 16b
+               are divided as follows:
+               o Upper 8b are backup of original data.
+               o Bit 7 indicates if the lower 7 bits are to be used as is or
+                 calculated.
+               o Bits 0-2 indicate type of operation to perform to get a 7b
+                 value.
+               o Bits 3-6 provide input for the operation.
+
+*/
+
+/* local functions
+        Forward declarations for the list helpers defined further down in this
+        file, plus the comparison callback type taken by core_list_mergesort.
+*/
+
+list_head *core_list_find(list_head *list, list_data *info);
+list_head *core_list_reverse(list_head *list);
+list_head *core_list_remove(list_head *item);
+list_head *core_list_undo_remove(list_head *item_removed,
+                                 list_head *item_modified);
+list_head *core_list_insert_new(list_head * insert_point,
+                                list_data * info,
+                                list_head **memblock,
+                                list_data **datablock,
+                                list_head * memblock_end,
+                                list_data * datablock_end);
+/* cmp(a, b, res): the merge sort takes the element from the first run when
+   the result is <= 0; res is handed through from the sort's caller. */
+typedef ee_s32 (*list_cmp)(list_data *a, list_data *b, core_results *res);
+list_head *core_list_mergesort(list_head *   list,
+                               list_cmp      cmp,
+                               core_results *res);
+
+/* Function: calc_func
+        Derive a 7 bit value from a list data word and cache it in place.
+
+        Operation:
+        If bit 7 of *pdata is set, the low 7 bits are returned as the cached
+        result. Otherwise bits 0-2 select the operation (0 - core_bench_state,
+        1 - core_bench_matrix, anything else - the data word itself), bits 3-6
+        replicated into both nibbles are the operation's input, the result is
+        folded into res->crc with crcu16, and its low 7 bits are written back
+        to *pdata with bit 7 set and the upper byte kept.
+
+        Parameters:
+        pdata - data word to evaluate; rewritten with the cached result.
+        res - benchmark state; crc is updated, and crcstate / crcmatrix are
+              recorded while they are still 0.
+
+        Returns:
+        The 7 bit result.
+*/
+ee_s16
+calc_func(ee_s16 *pdata, core_results *res)
+{
+    ee_s16 data = *pdata;
+    ee_s16 retval;
+    ee_u8  optype
+        = (data >> 7)
+          & 1;  /* bit 7 indicates if the function result has been cached */
+    if (optype) /* if cached, use cache */
+        return (data & 0x007f);
+    else
+    {                             /* otherwise calculate and cache the result */
+        ee_s16 flag = data & 0x7; /* bits 0-2 is type of function to perform */
+        ee_s16 dtype
+            = ((data >> 3)
+               & 0xf);       /* bits 3-6 is specific data for the operation */
+        dtype |= dtype << 4; /* replicate the lower 4 bits to get an 8b value */
+        switch (flag)
+        {
+            case 0:
+                if (dtype < 0x22) /* set min period for bit corruption */
+                    dtype = 0x22;
+                retval = core_bench_state(res->size,
+                                          res->memblock[3],
+                                          res->seed1,
+                                          res->seed2,
+                                          dtype,
+                                          res->crc);
+                if (res->crcstate == 0) /* keep only the first state result */
+                    res->crcstate = retval;
+                break;
+            case 1:
+                retval = core_bench_matrix(&(res->mat), dtype, res->crc);
+                if (res->crcmatrix == 0) /* keep only the first matrix result */
+                    res->crcmatrix = retval;
+                break;
+            default:
+                retval = data;
+                break;
+        }
+        res->crc = crcu16(retval, res->crc);
+        retval &= 0x007f;
+        *pdata = (data & 0xff00) | 0x0080 | retval; /* cache the result */
+        return retval;
+    }
+}
+/* Function: cmp_complex
+        Compare the data item in a list cell.
+
+        Both data words are run through <calc_func> first, so this comparison
+        may rewrite a->data16 and b->data16 (result caching) and update res.
+
+        Can be used by mergesort.
+
+        Returns:
+        calc_func(a) - calc_func(b).
+*/
+ee_s32
+cmp_complex(list_data *a, list_data *b, core_results *res)
+{
+    ee_s16 val1 = calc_func(&(a->data16), res);
+    ee_s16 val2 = calc_func(&(b->data16), res);
+    return val1 - val2;
+}
+
+/* Function: cmp_idx
+        Compare the idx item in a list cell, and regen the data.
+
+        When res is NULL, the low byte of each data16 is overwritten with its
+        upper byte (the backup of the original data), which discards anything
+        cached there by <calc_func>. When res is not NULL the data is left
+        untouched.
+
+        Can be used by mergesort.
+
+        Returns:
+        a->idx - b->idx.
+*/
+ee_s32
+cmp_idx(list_data *a, list_data *b, core_results *res)
+{
+    if (res == NULL)
+    {
+        a->data16 = (a->data16 & 0xff00) | (0x00ff & (a->data16 >> 8));
+        b->data16 = (b->data16 & 0xff00) | (0x00ff & (b->data16 >> 8));
+    }
+    return a->idx - b->idx;
+}
+
+/* Function: copy_info
+        Copy the data16 and idx fields of one list data item to another.
+
+        Parameters:
+        to - destination item.
+        from - source item.
+*/
+void
+copy_info(list_data *to, list_data *from)
+{
+    to->data16 = from->data16;
+    to->idx    = from->idx;
+}
+
+/* Benchmark for linked list:
+        - Try to find multiple data items.
+        - List sort
+        - Operate on data from list (crc)
+        - Single remove/reinsert
+        * At the end of this function, the list is back to original state
+
+        Parameters:
+        res - benchmark state; res->list is the list to work on and
+              res->seed3 the number of find operations.
+        finder_idx - starting idx for the searches. While it is >= 0 the
+              searches go by idx (incremented after every search), otherwise
+              <core_list_find> searches by data value. The sort by data
+              content is only done when it is > 0.
+
+        Returns:
+        Value accumulated from the find results and the two CRC passes.
+*/
+ee_u16
+core_bench_list(core_results *res, ee_s16 finder_idx)
+{
+    ee_u16     retval = 0;
+    ee_u16     found = 0, missed = 0;
+    list_head *list     = res->list;
+    ee_s16     find_num = res->seed3;
+    list_head *this_find;
+    list_head *finder, *remover;
+    list_data  info;
+    ee_s16     i;
+
+    info.idx = finder_idx;
+    /* find <find_num> values in the list, and change the list each time
+     * (reverse and cache if value found) */
+    for (i = 0; i < find_num; i++)
+    {
+        info.data16 = (i & 0xff);
+        this_find   = core_list_find(list, &info);
+        list        = core_list_reverse(list);
+        if (this_find == NULL)
+        {
+            missed++;
+            retval += (list->next->info->data16 >> 8) & 1;
+        }
+        else
+        {
+            found++;
+            if (this_find->info->data16 & 0x1) /* use found value */
+                retval += (this_find->info->data16 >> 9) & 1;
+            /* and cache next item at the head of the list (if any) */
+            if (this_find->next != NULL)
+            {
+                finder          = this_find->next;
+                this_find->next = finder->next;
+                finder->next    = list->next;
+                list->next      = finder;
+            }
+        }
+        if (info.idx >= 0)
+            info.idx++;
+#if CORE_DEBUG
+        ee_printf("List find %d: [%d,%d,%d]\n", i, retval, missed, found);
+#endif
+    }
+    retval += found * 4 - missed;
+    /* sort the list by data content and remove one item*/
+    if (finder_idx > 0)
+        list = core_list_mergesort(list, cmp_complex, res);
+    remover = core_list_remove(list->next);
+    /* CRC data content of list from location of index N forward, and then undo
+     * remove */
+    finder = core_list_find(list, &info);
+    if (!finder)
+        finder = list->next;
+    /* NOTE(review): the value fed to crc16 is list->info->data16 (the head
+     * item) on every step, not finder->info->data16. Presumably the expected
+     * CRCs depend on exactly this, so it must stay as is - confirm. */
+    while (finder)
+    {
+        retval = crc16(list->info->data16, retval);
+        finder = finder->next;
+    }
+#if CORE_DEBUG
+    ee_printf("List sort 1: %04x\n", retval);
+#endif
+    remover = core_list_undo_remove(remover, list->next);
+    /* sort the list by index, in effect returning the list to original state */
+    list = core_list_mergesort(list, cmp_idx, NULL);
+    /* CRC data content of list (same head-item pattern as the loop above) */
+    finder = list->next;
+    while (finder)
+    {
+        retval = crc16(list->info->data16, retval);
+        finder = finder->next;
+    }
+#if CORE_DEBUG
+    ee_printf("List sort 2: %04x\n", retval);
+#endif
+    return retval;
+}
+/* Function: core_list_init
+        Initialize list with data.
+
+        Operation:
+        The memory block is split in two regions: list headers first, data
+        items right after them. A fake head item and a fake tail item are
+        created, further items are inserted directly after the head, every
+        item between head and tail gets an idx, and the list is sorted by idx.
+
+        Parameters:
+        blksize - Size of memory to be initialized.
+        memblock - Pointer to memory block.
+        seed - Actual values chosen depend on the seed parameter.
+               The seed parameter MUST be supplied from a source that cannot
+               be determined at compile time
+
+        Returns:
+        Pointer to the head of the list.
+
+*/
+list_head *
+core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed)
+{
+    /* calculated pointers for the list */
+    ee_u32 per_item = 16 + sizeof(struct list_data_s);
+    ee_u32 size     = (blksize / per_item)
+                  - 2; /* to accommodate systems with 64b pointers, and make sure
+                          same code is executed, set max list elements */
+    list_head *memblock_end  = memblock + size;
+    list_data *datablock     = (list_data *)(memblock_end);
+    list_data *datablock_end = datablock + size;
+    /* some useful variables */
+    ee_u32     i;
+    list_head *finder, *list = memblock;
+    list_data  info;
+
+    /* create a fake items for the list head and tail */
+    list->next         = NULL;
+    list->info         = datablock;
+    list->info->idx    = 0x0000;
+    list->info->data16 = (ee_s16)0x8080;
+    memblock++;
+    datablock++;
+    info.idx    = 0x7fff;
+    info.data16 = (ee_s16)0xffff;
+    core_list_insert_new(
+        list, &info, &memblock, &datablock, memblock_end, datablock_end);
+
+    /* then insert size items (core_list_insert_new returns NULL without
+       inserting once either region is used up; the result is not checked) */
+    for (i = 0; i < size; i++)
+    {
+        ee_u16 datpat = ((ee_u16)(seed ^ i) & 0xf);
+        ee_u16 dat
+            = (datpat << 3) | (i & 0x7); /* alternate between algorithms */
+        info.data16 = (dat << 8) | dat;  /* fill the data with actual data and
+                                            upper bits with rebuild value */
+        core_list_insert_new(
+            list, &info, &memblock, &datablock, memblock_end, datablock_end);
+    }
+    /* and now index the list so we know initial seed order of the list */
+    finder = list->next;
+    i      = 1;
+    while (finder->next != NULL) /* stops at the last item, leaving its idx */
+    {
+        if (i < size / 5) /* first 20% of the list in order */
+            finder->info->idx = i++;
+        else
+        {
+            ee_u16 pat = (ee_u16)(i++ ^ seed); /* get a pseudo random number */
+            finder->info->idx = 0x3fff
+                                & (((i & 0x07) << 8)
+                                   | pat); /* make sure the mixed items end up
+                                              after the ones in sequence */
+        }
+        finder = finder->next;
+    }
+    list = core_list_mergesort(list, cmp_idx, NULL);
+#if CORE_DEBUG
+    ee_printf("Initialized list:\n");
+    finder = list;
+    while (finder)
+    {
+        ee_printf(
+            "[%04x,%04x]", finder->info->idx, (ee_u16)finder->info->data16);
+        finder = finder->next;
+    }
+    ee_printf("\n");
+#endif
+    return list;
+}
+
+/* Function: core_list_insert_new
+        Insert an item to the list
+
+        Operation:
+        Takes the next free list header from *memblock and the next free data
+        item from *datablock, links the new header directly after
+        insert_point and copies info into its data item. Both block pointers
+        are advanced by one.
+
+        Parameters:
+        insert_point - where to insert the item.
+        info - data for the cell.
+        memblock - pointer for the list header
+        datablock - pointer for the list data
+        memblock_end - end of region for list headers
+        datablock_end - end of region for list data
+
+        Returns:
+        Pointer to new item, or NULL (nothing inserted, nothing advanced) when
+        either region has no room left.
+*/
+list_head *
+core_list_insert_new(list_head * insert_point,
+                     list_data * info,
+                     list_head **memblock,
+                     list_data **datablock,
+                     list_head * memblock_end,
+                     list_data * datablock_end)
+{
+    list_head *newitem;
+
+    /* the "+ 1" means the last slot of each region is never handed out */
+    if ((*memblock + 1) >= memblock_end)
+        return NULL;
+    if ((*datablock + 1) >= datablock_end)
+        return NULL;
+
+    newitem = *memblock;
+    (*memblock)++;
+    newitem->next      = insert_point->next;
+    insert_point->next = newitem;
+
+    newitem->info = *datablock;
+    (*datablock)++;
+    copy_info(newitem->info, info);
+
+    return newitem;
+}
+
+/* Function: core_list_remove
+        Remove an item from the list.
+
+        Operation:
+        For a singly linked list, remove by swapping the data pointers of the
+        current cell and the next cell, and unlinking the next cell. The cell
+        that leaves the list is therefore item->next, now carrying the data
+        pointer that item had.
+
+        Note:
+        since there is always a fake item at the end of the list, no need to
+        check for NULL.
+
+        Parameters:
+        item - cell whose data is to be taken out of the list.
+
+        Returns:
+        Removed item (its next pointer is set to NULL).
+*/
+list_head *
+core_list_remove(list_head *item)
+{
+    list_data *tmp;
+    list_head *ret = item->next;
+    /* swap data pointers */
+    tmp        = item->info;
+    item->info = ret->info;
+    ret->info  = tmp;
+    /* and eliminate item */
+    item->next = item->next->next;
+    ret->next  = NULL;
+    return ret;
+}
+
+/* Function: core_list_undo_remove
+        Undo a remove operation.
+
+        Operation:
+        Since we want each iteration of the benchmark to be exactly the same,
+        we need to be able to undo a remove.
+        Link the removed item back into the list directly after
+        item_modified, and switch the info items of the two cells back.
+
+        Parameters:
+        item_removed - Return value from the <core_list_remove>
+        item_modified - List item that was modified during <core_list_remove>
+
+        Returns:
+        The item that was linked back to the list.
+
+*/
+list_head *
+core_list_undo_remove(list_head *item_removed, list_head *item_modified)
+{
+    list_data *tmp;
+    /* swap data pointers */
+    tmp                 = item_removed->info;
+    item_removed->info  = item_modified->info;
+    item_modified->info = tmp;
+    /* and insert item */
+    item_removed->next  = item_modified->next;
+    item_modified->next = item_removed;
+    return item_removed;
+}
+
+/* Function: core_list_find
+        Find an item in the list
+
+        Operation:
+        Find an item by idx (if info->idx is not negative) or, otherwise, by
+        specific data value: the low byte of an item's data16 is compared
+        with info->data16.
+
+        Parameters:
+        list - list head
+        info - idx or data to find
+
+        Returns:
+        Found item, or NULL if not found.
+*/
+list_head *
+core_list_find(list_head *list, list_data *info)
+{
+    if (info->idx >= 0)
+    {
+        while (list && (list->info->idx != info->idx))
+            list = list->next;
+        return list;
+    }
+    else
+    {
+        while (list && ((list->info->data16 & 0xff) != info->data16))
+            list = list->next;
+        return list;
+    }
+}
+/* Function: core_list_reverse
+        Reverse a list
+
+        Operation:
+        Rearrange the pointers so the list is reversed.
+
+        Parameters:
+        list - list head
+
+        Returns:
+        New head of the list (the former last item), or NULL if list is NULL.
+*/
+
+list_head *
+core_list_reverse(list_head *list)
+{
+    list_head *next = NULL, *tmp;
+    while (list)
+    {
+        tmp        = list->next; /* remember the rest of the list */
+        list->next = next;       /* point the current item backwards */
+        next       = list;
+        list       = tmp;
+    }
+    return next;
+}
+/* Function: core_list_mergesort
+        Sort the list in place without recursion.
+
+        Description:
+        Use mergesort, as for linked list this is a realistic solution.
+        Also, since this is aimed at embedded, care was taken to use iterative
+        rather than recursive algorithm. The sort can either return the list
+        to original order (by idx), or use the data item to invoke other
+        algorithms and change the order of the list.
+
+        Each pass merges neighbouring runs of insize items, and insize doubles
+        after every pass until a pass needs at most one merge. When cmp
+        returns <= 0 the item from the first run is taken.
+
+        Parameters:
+        list - list to be sorted.
+        cmp - cmp function to use
+        res - handed to cmp unchanged as its third argument.
+
+        Returns:
+        New head of the list.
+
+        Note:
+        We have a special header for the list that will always be first,
+        but the algorithm could theoretically modify where the list starts.
+
+        NOTE(review): for a NULL list, tail is still NULL when tail->next is
+        written after the merge loop. The calls in this file pass a list that
+        was built with a head item - confirm no caller passes NULL.
+
+ */
+list_head *
+core_list_mergesort(list_head *list, list_cmp cmp, core_results *res)
+{
+    list_head *p, *q, *e, *tail;
+    ee_s32     insize, nmerges, psize, qsize, i;
+
+    insize = 1;
+
+    while (1)
+    {
+        p    = list;
+        list = NULL;
+        tail = NULL;
+
+        nmerges = 0; /* count number of merges we do in this pass */
+
+        while (p)
+        {
+            nmerges++; /* there exists a merge to be done */
+            /* step `insize' places along from p */
+            q     = p;
+            psize = 0;
+            for (i = 0; i < insize; i++)
+            {
+                psize++;
+                q = q->next;
+                if (!q)
+                    break;
+            }
+
+            /* if q hasn't fallen off end, we have two lists to merge */
+            qsize = insize;
+
+            /* now we have two lists; merge them */
+            while (psize > 0 || (qsize > 0 && q))
+            {
+
+                /* decide whether next element of merge comes from p or q */
+                if (psize == 0)
+                {
+                    /* p is empty; e must come from q. */
+                    e = q;
+                    q = q->next;
+                    qsize--;
+                }
+                else if (qsize == 0 || !q)
+                {
+                    /* q is empty; e must come from p. */
+                    e = p;
+                    p = p->next;
+                    psize--;
+                }
+                else if (cmp(p->info, q->info, res) <= 0)
+                {
+                    /* First element of p is lower (or same); e must come from
+                     * p. */
+                    e = p;
+                    p = p->next;
+                    psize--;
+                }
+                else
+                {
+                    /* First element of q is lower; e must come from q. */
+                    e = q;
+                    q = q->next;
+                    qsize--;
+                }
+
+                /* add the next element to the merged list */
+                if (tail)
+                {
+                    tail->next = e;
+                }
+                else
+                {
+                    list = e;
+                }
+                tail = e;
+            }
+
+            /* now p has stepped `insize' places along, and q has too */
+            p = q;
+        }
+
+        tail->next = NULL;
+
+        /* If we have done only one merge, we're finished. */
+        if (nmerges <= 1) /* allow for nmerges==0, the empty list case */
+            return list;
+
+        /* Otherwise repeat, merging lists twice the size */
+        insize *= 2;
+    }
+#if COMPILER_REQUIRES_SORT_RETURN
+    return list;
+#endif
+}
--- a/FIRMWARE/COREMARK/core_main.c
+++ b/FIRMWARE/COREMARK/core_main.c
@@ -0,0 +1,451 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+/* File: core_main.c
+        This file contains the framework to acquire a block of memory, seed
+   initial parameters, run the benchmark and report the results.
+*/
+#include "coremark.h"
+
+/* Function: iterate
+        Run the benchmark for a specified number of iterations.
+
+        Parameters:
+        pres - pointer to the <core_results> context to run; it is passed as
+   void * and cast back inside the function.
+
+        Operation:
+        Clears crc, crclist, crcmatrix and crcstate in the context. Then, for
+   each of res->iterations rounds, calls <core_bench_list> twice (second
+   argument 1, then -1) and folds each returned value into res->crc with
+   <crcu16>. The crc reached at the end of the first round is also stored in
+   res->crclist.
+
+        Returns:
+        NULL.
+
+        The *_known_crc tables that follow are indexed by the known_id that
+   main() derives from the seed crc: 0 = 6k performance, 1 = 6k validation,
+   2 = profile generation, 3 = 2K performance, 4 = 2K validation.
+*/
+static ee_u16 list_known_crc[]   = { (ee_u16)0xd4b0,
+                                   (ee_u16)0x3340,
+                                   (ee_u16)0x6a79,
+                                   (ee_u16)0xe714,
+                                   (ee_u16)0xe3c1 };
+static ee_u16 matrix_known_crc[] = { (ee_u16)0xbe52,
+                                     (ee_u16)0x1199,
+                                     (ee_u16)0x5608,
+                                     (ee_u16)0x1fd7,
+                                     (ee_u16)0x0747 };
+static ee_u16 state_known_crc[]  = { (ee_u16)0x5e47,
+                                    (ee_u16)0x39bf,
+                                    (ee_u16)0xe5a4,
+                                    (ee_u16)0x8e3a,
+                                    (ee_u16)0x8d84 };
+void *
+iterate(void *pres)
+{
+    ee_u32        i;
+    ee_u16        crc;
+    core_results *res        = (core_results *)pres;
+    ee_u32        iterations = res->iterations;
+    res->crc                 = 0;
+    res->crclist             = 0;
+    res->crcmatrix           = 0;
+    res->crcstate            = 0;
+
+    for (i = 0; i < iterations; i++)
+    {
+        crc      = core_bench_list(res, 1);
+        res->crc = crcu16(crc, res->crc);
+        crc      = core_bench_list(res, -1);
+        res->crc = crcu16(crc, res->crc);
+        if (i == 0)
+            res->crclist = res->crc; /* snapshot after the first round only */
+    }
+    return NULL;
+}
+
+#if (SEED_METHOD == SEED_ARG) /* seeds are fetched through get_seed_args() using argc/argv */
+ee_s32 get_seed_args(int i, int argc, char *argv[]);
+#define get_seed(x)    (ee_s16) get_seed_args(x, argc, argv) /* uses the argc/argv in scope at the call site; result cast to ee_s16 */
+#define get_seed_32(x) get_seed_args(x, argc, argv)
+#else /* via function or volatile */
+ee_s32 get_seed_32(int i);
+#define get_seed(x) (ee_s16) get_seed_32(x) /* same seed, cast to ee_s16 */
+#endif
+
+#if (MEM_METHOD == MEM_STATIC)
+ee_u8 static_memblk[TOTAL_DATA_SIZE]; /* data block that main() assigns to results[0].memblock[0] */
+#endif
+char *mem_name[3] = { "Static", "Heap", "Stack" }; /* indexed by MEM_METHOD in main()'s result line */
+/* Function: main
+        Main entry routine for the benchmark.
+        This function is responsible for the following steps:
+
+        1 - Initialize input seeds from a source that cannot be determined at
+   compile time.
+        2 - Initialize memory block for use.
+        3 - Run and time the benchmark.
+        4 - Report results, testing the validity of the output if the seeds
+   are known.
+
+        Arguments:
+        1 - first seed  : Any value
+        2 - second seed : Must be identical to first for iterations to be
+   identical
+        3 - third seed  : Any value, should be at least an order of magnitude
+   less than the input size, but bigger than 32.
+        4 - Iterations  : Special, if set to 0, iterations will be
+   automatically determined such that the benchmark will run between 10 to
+   100 secs
+        5 - Algorithms  : Bit mask of algorithms to execute; if 0, all
+   algorithms are executed.
+
+        Notes on this version of the file:
+        The score lines are printed by print_coremarks(). The stock
+   "Iterations", "Compiler version" and "Compiler flags" lines are commented
+   out, as are the "Total time" / "Iterations/Sec" lines of the branch used
+   when HAS_FLOAT is 0. A run shorter than 10 seconds still prints the ERROR
+   line, but it is not counted in total_errors.
+*/
+
+#if MAIN_HAS_NOARGC
+MAIN_RETURN_TYPE
+main(void)
+{
+    int   argc = 0;
+    char *argv[1];
+#else
+MAIN_RETURN_TYPE
+main(int argc, char *argv[])
+{
+#endif
+    ee_u16       i, j = 0, num_algorithms = 0;
+    ee_s16       known_id = -1, total_errors = 0;
+    ee_u16       seedcrc = 0;
+    CORE_TICKS   total_time;
+    core_results results[MULTITHREAD];
+#if (MEM_METHOD == MEM_STACK)
+    ee_u8 stack_memblock[TOTAL_DATA_SIZE * MULTITHREAD];
+#endif
+    /* first call any initializations needed */
+    portable_init(&(results[0].port), &argc, argv);
+    /* First some checks to make sure benchmark will run ok */
+    if (sizeof(struct list_head_s) > 128)
+    {
+        ee_printf("list_head structure too big for comparable data!\n");
+        return MAIN_RETURN_VAL;
+    }
+    results[0].seed1      = get_seed(1);
+    results[0].seed2      = get_seed(2);
+    results[0].seed3      = get_seed(3);
+    results[0].iterations = get_seed_32(4);
+#if CORE_DEBUG
+    results[0].iterations = 1;
+#endif
+    results[0].execs = get_seed_32(5);
+    if (results[0].execs == 0)
+    { /* if not supplied, execute all algorithms */
+        results[0].execs = ALL_ALGORITHMS_MASK;
+    }
+    /* put in some default values based on one seed only for easy testing */
+    if ((results[0].seed1 == 0) && (results[0].seed2 == 0)
+        && (results[0].seed3 == 0))
+    { /* performance run */
+        results[0].seed1 = 0;
+        results[0].seed2 = 0;
+        results[0].seed3 = 0x66;
+    }
+    if ((results[0].seed1 == 1) && (results[0].seed2 == 0)
+        && (results[0].seed3 == 0))
+    { /* validation run */
+        results[0].seed1 = 0x3415;
+        results[0].seed2 = 0x3415;
+        results[0].seed3 = 0x66;
+    }
+#if (MEM_METHOD == MEM_STATIC)
+    results[0].memblock[0] = (void *)static_memblk;
+    results[0].size        = TOTAL_DATA_SIZE;
+    results[0].err         = 0;
+#if (MULTITHREAD > 1)
+#error "Cannot use a static data area with multiple contexts!"
+#endif
+#elif (MEM_METHOD == MEM_MALLOC)
+    for (i = 0; i < MULTITHREAD; i++)
+    {
+        ee_s32 malloc_override = get_seed(7);
+        if (malloc_override != 0)
+            results[i].size = malloc_override;
+        else
+            results[i].size = TOTAL_DATA_SIZE;
+        results[i].memblock[0] = portable_malloc(results[i].size);
+        results[i].seed1       = results[0].seed1;
+        results[i].seed2       = results[0].seed2;
+        results[i].seed3       = results[0].seed3;
+        results[i].err         = 0;
+        results[i].execs       = results[0].execs;
+    }
+#elif (MEM_METHOD == MEM_STACK)
+for (i = 0; i < MULTITHREAD; i++)
+{
+    results[i].memblock[0] = stack_memblock + i * TOTAL_DATA_SIZE;
+    results[i].size        = TOTAL_DATA_SIZE;
+    results[i].seed1       = results[0].seed1;
+    results[i].seed2       = results[0].seed2;
+    results[i].seed3       = results[0].seed3;
+    results[i].err         = 0;
+    results[i].execs       = results[0].execs;
+}
+#else
+#error "Please define a way to initialize a memory block."
+#endif
+    /* Data init */
+    /* Find out how much space we have based on number of algorithms */
+    for (i = 0; i < NUM_ALGORITHMS; i++)
+    {
+        if ((1 << (ee_u32)i) & results[0].execs)
+            num_algorithms++;
+    }
+    for (i = 0; i < MULTITHREAD; i++)
+        results[i].size = results[i].size / num_algorithms; /* per-algorithm share of the block */
+    /* Assign pointers */
+    for (i = 0; i < NUM_ALGORITHMS; i++)
+    {
+        ee_u32 ctx;
+        if ((1 << (ee_u32)i) & results[0].execs)
+        {
+            for (ctx = 0; ctx < MULTITHREAD; ctx++)
+                results[ctx].memblock[i + 1]
+                    = (char *)(results[ctx].memblock[0]) + results[0].size * j;
+            j++; /* j counts the enabled algorithms placed so far */
+        }
+    }
+    /* call inits */
+    for (i = 0; i < MULTITHREAD; i++)
+    {
+        if (results[i].execs & ID_LIST)
+        {
+            results[i].list = core_list_init(
+                results[0].size, results[i].memblock[1], results[i].seed1);
+        }
+        if (results[i].execs & ID_MATRIX)
+        {
+            core_init_matrix(results[0].size,
+                             results[i].memblock[2],
+                             (ee_s32)results[i].seed1
+                                 | (((ee_s32)results[i].seed2) << 16),
+                             &(results[i].mat));
+        }
+        if (results[i].execs & ID_STATE)
+        {
+            core_init_state(
+                results[0].size, results[i].seed1, results[i].memblock[3]);
+        }
+    }
+
+    /* automatically determine number of iterations if not set */
+    if (results[0].iterations == 0)
+    {
+        secs_ret secs_passed = 0;
+        ee_u32   divisor;
+        results[0].iterations = 1;
+        while (secs_passed < (secs_ret)1)
+        {
+            results[0].iterations *= 10;
+            start_time();
+            iterate(&results[0]);
+            stop_time();
+            secs_passed = time_in_secs(get_time());
+        }
+        /* now we know it executes for at least 1 sec, set actual run time at
+         * about 10 secs */
+        divisor = (ee_u32)secs_passed;
+        if (divisor == 0) /* some machines cast float to int as 0 since this
+                             conversion is not defined by ANSI, but we know at
+                             least one second passed */
+            divisor = 1;
+        results[0].iterations *= 1 + 10 / divisor;
+    }
+    /* perform actual benchmark */
+    start_time();
+#if (MULTITHREAD > 1)
+    if (default_num_contexts > MULTITHREAD)
+    {
+        default_num_contexts = MULTITHREAD;
+    }
+    for (i = 0; i < default_num_contexts; i++)
+    {
+        results[i].iterations = results[0].iterations;
+        results[i].execs      = results[0].execs;
+        core_start_parallel(&results[i]);
+    }
+    for (i = 0; i < default_num_contexts; i++)
+    {
+        core_stop_parallel(&results[i]);
+    }
+#else
+    iterate(&results[0]);
+#endif
+    stop_time();
+    total_time = get_time();
+    /* get a function of the input to report */
+    seedcrc = crc16(results[0].seed1, seedcrc);
+    seedcrc = crc16(results[0].seed2, seedcrc);
+    seedcrc = crc16(results[0].seed3, seedcrc);
+    seedcrc = crc16(results[0].size, seedcrc);
+
+    switch (seedcrc)
+    {                /* test known output for common seeds */
+        case 0x8a02: /* seed1=0, seed2=0, seed3=0x66, size 2000 per algorithm */
+            known_id = 0;
+            ee_printf("6k performance run parameters for coremark.\n");
+            break;
+        case 0x7b05: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 2000 per
+                        algorithm */
+            known_id = 1;
+            ee_printf("6k validation run parameters for coremark.\n");
+            break;
+        case 0x4eaf: /* seed1=0x8, seed2=0x8, seed3=0x8, size 400 per algorithm
+                      */
+            known_id = 2;
+            ee_printf("Profile generation run parameters for coremark.\n");
+            break;
+        case 0xe9f5: /* seed1=0, seed2=0, seed3=0x66, size 666 per algorithm */
+            known_id = 3;
+            ee_printf("2K performance run parameters for coremark.\n");
+            break;
+        case 0x18f2: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 666 per
+                        algorithm */
+            known_id = 4;
+            ee_printf("2K validation run parameters for coremark.\n");
+            break;
+        default:
+            total_errors = -1; /* unknown seed combination: reported below as not validatable */
+            break;
+    }
+    if (known_id >= 0)
+    {
+        for (i = 0; i < default_num_contexts; i++)
+        {
+            results[i].err = 0;
+            if ((results[i].execs & ID_LIST)
+                && (results[i].crclist != list_known_crc[known_id]))
+            {
+                ee_printf("[%u]ERROR! list crc 0x%04x - should be 0x%04x\n",
+                          i,
+                          results[i].crclist,
+                          list_known_crc[known_id]);
+                results[i].err++;
+            }
+            if ((results[i].execs & ID_MATRIX)
+                && (results[i].crcmatrix != matrix_known_crc[known_id]))
+            {
+                ee_printf("[%u]ERROR! matrix crc 0x%04x - should be 0x%04x\n",
+                          i,
+                          results[i].crcmatrix,
+                          matrix_known_crc[known_id]);
+                results[i].err++;
+            }
+            if ((results[i].execs & ID_STATE)
+                && (results[i].crcstate != state_known_crc[known_id]))
+            {
+                ee_printf("[%u]ERROR! state crc 0x%04x - should be 0x%04x\n",
+                          i,
+                          results[i].crcstate,
+                          state_known_crc[known_id]);
+                results[i].err++;
+            }
+            total_errors += results[i].err;
+        }
+    }
+    total_errors += check_data_types();
+    /* and report results */
+    ee_printf("CoreMark Size    : %lu\n", (long unsigned)results[0].size);
+    ee_printf("Total ticks      : %lu\n", (long unsigned)total_time);
+#if HAS_FLOAT
+    ee_printf("Total time (secs): %f\n", time_in_secs(total_time));
+    if (time_in_secs(total_time) > 0)
+        ee_printf("Iterations/Sec   : %f\n",
+                  (default_num_contexts * results[0].iterations)
+                      / time_in_secs(total_time)); 
+#else
+    /*
+    ee_printf("Total time (secs): %d\n", time_in_secs(total_time));
+    if (time_in_secs(total_time) > 0)
+        ee_printf("Iterations/Sec   : %d\n",
+                  default_num_contexts * results[0].iterations
+                      / time_in_secs(total_time));
+    */
+#endif
+
+
+    print_coremarks(total_time); /* score report supplied by the port layer */
+    
+    if (time_in_secs(total_time) < 10)
+    {
+        ee_printf(
+            "ERROR! Must execute for at least 10 secs for a valid result!\n");
+        // total_errors++; 
+    }
+
+    /*
+    ee_printf("Iterations       : %lu\n",
+              (long unsigned)default_num_contexts * results[0].iterations);
+    ee_printf("Compiler version : %s\n", COMPILER_VERSION);
+    ee_printf("Compiler flags   : %s\n", COMPILER_FLAGS);
+    */
+    
+#if (MULTITHREAD > 1)
+    ee_printf("Parallel %s : %d\n", PARALLEL_METHOD, default_num_contexts);
+#endif
+    ee_printf("Memory location  : %s\n", MEM_LOCATION);
+    /* output for verification */
+    ee_printf("seedcrc          : 0x%04x\n", seedcrc);
+    if (results[0].execs & ID_LIST)
+        for (i = 0; i < default_num_contexts; i++)
+            ee_printf("[%d]crclist       : 0x%04x\n", i, results[i].crclist);
+    if (results[0].execs & ID_MATRIX)
+        for (i = 0; i < default_num_contexts; i++)
+            ee_printf("[%d]crcmatrix     : 0x%04x\n", i, results[i].crcmatrix);
+    if (results[0].execs & ID_STATE)
+        for (i = 0; i < default_num_contexts; i++)
+            ee_printf("[%d]crcstate      : 0x%04x\n", i, results[i].crcstate);
+    for (i = 0; i < default_num_contexts; i++)
+        ee_printf("[%d]crcfinal      : 0x%04x\n", i, results[i].crc);
+    if (total_errors == 0)
+    {
+        ee_printf(
+            "Correct operation validated. See README.md for run and reporting "
+            "rules.\n");
+#if HAS_FLOAT
+        if (known_id == 3)
+        {
+            ee_printf("CoreMark 1.0 : %f / %s %s",
+                      default_num_contexts * results[0].iterations
+                          / time_in_secs(total_time),
+                      COMPILER_VERSION,
+                      COMPILER_FLAGS);
+#if defined(MEM_LOCATION) && !defined(MEM_LOCATION_UNSPEC)
+            ee_printf(" / %s", MEM_LOCATION);
+#else
+            ee_printf(" / %s", mem_name[MEM_METHOD]);
+#endif
+
+#if (MULTITHREAD > 1)
+            ee_printf(" / %d:%s", default_num_contexts, PARALLEL_METHOD);
+#endif
+            ee_printf("\n");
+        }
+#endif
+    }
+    if (total_errors > 0)
+        ee_printf("Errors detected\n");
+    if (total_errors < 0)
+        ee_printf(
+            "Cannot validate operation for these seed values, please compare "
+            "with results on a known platform.\n");
+
+#if (MEM_METHOD == MEM_MALLOC)
+    for (i = 0; i < MULTITHREAD; i++)
+        portable_free(results[i].memblock[0]);
+#endif
+    /* And last call any target specific code for finalizing */
+    portable_fini(&(results[0].port));
+
+    return MAIN_RETURN_VAL;
+}
--- a/FIRMWARE/COREMARK/core_matrix.c
+++ b/FIRMWARE/COREMARK/core_matrix.c
@@ -0,0 +1,359 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/*
+Topic: Description
+        Matrix manipulation benchmark
+
+        This very simple algorithm forms the basis of many more complex
+algorithms.
+
+        The tight inner loop is the focus of many optimizations (compiler as
+well as hardware based) and is thus relevant for embedded processing.
+
+        The total available data space will be divided into 3 parts:
+        NxN Matrix A - initialized with small values (upper 3/4 of the bits all
+zero).
+        NxN Matrix B - initialized with medium values (upper half of the bits
+all zero).
+        NxN Matrix C - used for the result.
+
+        The actual values for A and B must be derived based on input that is not
+available at compile time.
+*/
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val); /* forward declarations of the kernels defined later in this file */
+ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval);
+void   matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val);
+void   matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void   matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void   matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void   matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val);
+
+#define matrix_test_next(x)      (x + 1)
+#define matrix_clip(x, y)        ((y) ? (x)&0x0ff : (x)&0x0ffff) /* keep the low 8 bits of x when y is non-zero, else the low 16 bits */
+#define matrix_big(x)            (0xf000 | (x)) /* x with bits 12..15 forced on */
+#define bit_extract(x, from, to) (((x) >> (from)) & (~(0xffffffff << (to)))) /* `to' bits of x starting at bit `from'; `to' is a width, not an end position */
+
+#if CORE_DEBUG
+/* Debug helper: print the N x N matrix A (MATDAT elements) through
+   ee_printf. A header line with the given name and dimensions comes first,
+   then one comma-separated line per row. */
+void
+printmat(MATDAT *A, ee_u32 N, char *name)
+{
+    ee_u32 row, col;
+
+    ee_printf("Matrix %s [%dx%d]:\n", name, N, N);
+    for (row = 0; row < N; row++)
+    {
+        MATDAT *line = A + row * N; /* first element of this row */
+
+        for (col = 0; col < N; col++)
+        {
+            if (col > 0)
+                ee_printf(",");
+            ee_printf("%d", line[col]);
+        }
+        ee_printf("\n");
+    }
+}
+/* Debug helper: print the N x N result matrix C (MATRES elements) through
+   ee_printf. A header line with the given name and dimensions comes first,
+   then one comma-separated line per row. */
+void
+printmatC(MATRES *C, ee_u32 N, char *name)
+{
+    ee_u32 row, col;
+
+    ee_printf("Matrix %s [%dx%d]:\n", name, N, N);
+    for (row = 0; row < N; row++)
+    {
+        MATRES *line = C + row * N; /* first element of this row */
+
+        for (col = 0; col < N; col++)
+        {
+            if (col > 0)
+                ee_printf(",");
+            ee_printf("%d", line[col]);
+        }
+        ee_printf("\n");
+    }
+}
+#endif
+/* Function: core_bench_matrix
+        Benchmark function
+
+        Runs <matrix_test> once on the matrices referenced by p, using seed
+   (converted to MATDAT) as the constant for that pass, and folds the value
+   it returns into crc with <crc16>.
+
+        Parameters:
+        p - matrix parameters; the fields N, C, A and B are read.
+        seed - constant handed to <matrix_test> as val.
+        crc - running crc to extend.
+
+        Returns:
+        The updated crc.
+*/
+ee_u16
+core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc)
+{
+    ee_u32  N   = p->N;
+    MATRES *C   = p->C;
+    MATDAT *A   = p->A;
+    MATDAT *B   = p->B;
+    MATDAT  val = (MATDAT)seed;
+
+    crc = crc16(matrix_test(N, C, A, B, val), crc);
+
+    return crc;
+}
+
+/* Function: matrix_test
+        Perform matrix manipulation.
+
+        Parameters:
+        N - Dimensions of the matrix.
+        C - memory for result matrix.
+        A - input matrix
+        B - operator matrix (not changed during operations)
+        val - constant added to and multiplied with A; matrix_big(val) is
+   used as the clip value passed to <matrix_sum>.
+
+        Returns:
+        A CRC value that captures all results calculated in the function.
+        In particular, crc of the value calculated on the result matrix
+        after each step by <matrix_sum>.
+
+        Operation:
+
+        1 - Add a constant value to all elements of matrix A.
+        2 - Multiply A by the constant, result in C.
+        3 - Multiply A by a vector (the first N elements of B), result in C.
+        4 - Multiply A by B, result in C.
+        5 - Multiply A by B again, combining extracted bit fields of each
+   product, result in C.
+        6 - Add the negated constant to all elements of A.
+
+        <matrix_sum> of C is folded into the crc after each of steps 2 to 5.
+
+        After the last step, matrix A is back to original contents.
+*/
+ee_s16
+matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val)
+{
+    ee_u16 crc     = 0;
+    MATDAT clipval = matrix_big(val);
+
+    matrix_add_const(N, A, val); /* make sure data changes  */
+#if CORE_DEBUG
+    printmat(A, N, "matrix_add_const");
+#endif
+    matrix_mul_const(N, C, A, val);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
+#if CORE_DEBUG
+    printmatC(C, N, "matrix_mul_const");
+#endif
+    matrix_mul_vect(N, C, A, B);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
+#if CORE_DEBUG
+    printmatC(C, N, "matrix_mul_vect");
+#endif
+    matrix_mul_matrix(N, C, A, B);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
+#if CORE_DEBUG
+    printmatC(C, N, "matrix_mul_matrix");
+#endif
+    matrix_mul_matrix_bitextract(N, C, A, B);
+    crc = crc16(matrix_sum(N, C, clipval), crc);
+#if CORE_DEBUG
+    printmatC(C, N, "matrix_mul_matrix_bitextract");
+#endif
+
+    matrix_add_const(N, A, -val); /* return matrix to initial value */
+    return crc; /* ee_u16 accumulator handed back through the ee_s16 return type */
+}
+
+/* Function : core_init_matrix
+        Carve the matrices A, B and C out of a memory block and fill A and B
+   with seed-derived values.
+
+        Parameters:
+        blksize - Size of the memory block.
+        memblk - Pointer to the memory block.
+        seed - Starting value of the pseudo-random sequence; 0 is replaced
+   by 1.
+        p - receives the A, B and C pointers and the dimension N.
+
+        Returns:
+        The matrix dimension N.
+
+        Sizing:
+        The candidate dimension grows until dim * dim * 2 * 4 reaches blksize,
+   and N is one less than that.
+
+        Layout:
+        A starts at the aligned block start, B follows A's N * N elements,
+   and C starts at the aligned address after B.
+
+        Note:
+        The seed parameter MUST be supplied from a source that cannot be
+   determined at compile time
+*/
+ee_u32
+core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p)
+{
+    ee_u32  dim, footprint;
+    ee_u32  row, col;
+    ee_u32  N;
+    ee_s32  order = 1;
+    MATDAT *A;
+    MATDAT *B;
+
+    if (seed == 0)
+        seed = 1;
+
+    /* first dimension whose footprint is no longer below blksize */
+    for (dim = 0, footprint = 0; footprint < blksize;
+         footprint = dim * dim * 2 * 4)
+    {
+        dim++;
+    }
+    N = dim - 1;
+
+    A = (MATDAT *)align_mem(memblk);
+    B = A + N * N;
+
+    for (row = 0; row < N; row++)
+    {
+        for (col = 0; col < N; col++)
+        {
+            ee_u32 at = row * N + col;
+            MATDAT val;
+
+            seed  = ((order * seed) % 65536);
+            val   = (seed + order);
+            val   = matrix_clip(val, 0);
+            B[at] = val;
+            val   = (val + order);
+            val   = matrix_clip(val, 1);
+            A[at] = val;
+            order++;
+        }
+    }
+
+    p->N = N;
+    p->A = A;
+    p->B = B;
+    p->C = (MATRES *)align_mem(B + N * N);
+#if CORE_DEBUG
+    printmat(A, N, "A");
+    printmat(B, N, "B");
+#endif
+    return N;
+}
+
+/* Function: matrix_sum
+        Calculate a function that depends on the values of elements in the
+   matrix.
+
+        For each element, accumulate into a temporary variable.
+
+        As long as this value does not exceed the parameter clipval,
+        add 1 to the result if the element is bigger than the previous.
+
+        Otherwise, reset the accumulator and add 10 to the result.
+
+        Parameters:
+        N - Dimensions of the matrix.
+        C - matrix to scan, visited row by row.
+        clipval - threshold the running total is compared against.
+
+        Returns:
+        The accumulated result.
+*/
+ee_s16
+matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval)
+{
+    MATRES tmp = 0, prev = 0, cur = 0;
+    ee_s16 ret = 0;
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            cur = C[i * N + j];
+            tmp += cur;
+            if (tmp > clipval)
+            {
+                ret += 10;
+                tmp = 0;
+            }
+            else
+            {
+                ret += (cur > prev) ? 1 : 0;
+            }
+            prev = cur; /* prev carries over across row boundaries */
+        }
+    }
+    return ret;
+}
+
+/* Function: matrix_mul_const
+        Multiply a matrix by a constant.
+        This could be used as a scaler for instance.
+
+        Every element of the N x N matrix A is multiplied by val, with both
+   operands converted to MATRES first, and stored at the same position in C.
+   A is not modified.
+*/
+void
+matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val)
+{
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            C[i * N + j] = (MATRES)A[i * N + j] * (MATRES)val;
+        }
+    }
+}
+
+/* Function: matrix_add_const
+        Add a constant value to all elements of a matrix.
+
+        The N x N matrix A is updated in place; val may be negative, which
+   <matrix_test> uses to undo an earlier addition.
+*/
+void
+matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val)
+{
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            A[i * N + j] += val;
+        }
+    }
+}
+
+/* Function: matrix_mul_vect
+        Multiply a matrix by a vector.
+        This is common in many simple filters (e.g. fir where a vector of
+   coefficients is applied to the matrix.)
+
+        The vector is the first N elements of B. Only the first N elements
+   of C are written: C[i] becomes the sum over j of A[i][j] * B[j], with the
+   products computed in MATRES.
+*/
+void
+matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B)
+{
+    ee_u32 i, j;
+    for (i = 0; i < N; i++)
+    {
+        C[i] = 0;
+        for (j = 0; j < N; j++)
+        {
+            C[i] += (MATRES)A[i * N + j] * (MATRES)B[j];
+        }
+    }
+}
+
+/* Function: matrix_mul_matrix
+        Multiply a matrix by a matrix.
+        Basic code is used in many algorithms, mostly with minor changes such as
+   scaling.
+
+        All three matrices are N x N and stored row by row. C[i][j] is
+   cleared and then accumulates A[i][k] * B[k][j] over k, with the products
+   computed in MATRES.
+*/
+void
+matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B)
+{
+    ee_u32 i, j, k;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            C[i * N + j] = 0;
+            for (k = 0; k < N; k++)
+            {
+                C[i * N + j] += (MATRES)A[i * N + k] * (MATRES)B[k * N + j];
+            }
+        }
+    }
+}
+
+/* Function: matrix_mul_matrix_bitextract
+        Multiply a matrix by a matrix, and extract some bits from the result.
+        Basic code is used in many algorithms, mostly with minor changes such as
+   scaling.
+
+        Same traversal as <matrix_mul_matrix>, but each product
+   A[i][k] * B[k][j] is not added directly: two bit fields are extracted
+   from it with bit_extract (4 bits starting at bit 2, and 7 bits starting
+   at bit 5) and the product of those two fields is accumulated into
+   C[i][j].
+*/
+void
+matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B)
+{
+    ee_u32 i, j, k;
+    for (i = 0; i < N; i++)
+    {
+        for (j = 0; j < N; j++)
+        {
+            C[i * N + j] = 0;
+            for (k = 0; k < N; k++)
+            {
+                MATRES tmp = (MATRES)A[i * N + k] * (MATRES)B[k * N + j];
+                C[i * N + j] += bit_extract(tmp, 2, 4) * bit_extract(tmp, 5, 7);
+            }
+        }
+    }
+}
--- a/FIRMWARE/COREMARK/core_portme.c
+++ b/FIRMWARE/COREMARK/core_portme.c
@@ -0,0 +1,215 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+#include <io.h>
+#include <stdio.h>
+#include "coremark.h"
+#include "core_portme.h"
+#include <perf.h>
+
+#if VALIDATION_RUN /* seed triple that main() labels a validation run */
+volatile ee_s32 seed1_volatile = 0x3415;
+volatile ee_s32 seed2_volatile = 0x3415;
+volatile ee_s32 seed3_volatile = 0x66;
+#endif
+#if PERFORMANCE_RUN /* seed triple that main() labels a performance run */
+volatile ee_s32 seed1_volatile = 0x0;
+volatile ee_s32 seed2_volatile = 0x0;
+volatile ee_s32 seed3_volatile = 0x66;
+#endif
+#if PROFILE_RUN /* seed triple that main() labels a profile generation run */
+volatile ee_s32 seed1_volatile = 0x8;
+volatile ee_s32 seed2_volatile = 0x8;
+volatile ee_s32 seed3_volatile = 0x8;
+#endif
+volatile ee_s32 seed4_volatile = ITERATIONS; /* presumably the iteration count main() reads via get_seed_32(4) - TODO confirm in get_seed_32 */
+volatile ee_s32 seed5_volatile = 0; /* presumably the algorithm mask read via get_seed_32(5); main() treats 0 as "all algorithms" - TODO confirm */
+
+/* Porting : Timing functions
+        How to capture time and convert to seconds must be ported to whatever is
+   supported by the platform. e.g. Read value from on board RTC, read value from
+   cpu clock cycles performance counter etc.
+
+        This port reads the counter returned by rdcycle() and hands it back
+   as CORETIMETYPE; the conversion to seconds is done by <time_in_secs>
+   using CLOCKS_PER_SEC.
+*/
+CORETIMETYPE  barebones_clock(void)
+{
+   return (CORETIMETYPE)(rdcycle());
+}
+
+/* Define : TIMER_RES_DIVIDER
+        Divider to trade off timer resolution and total time that can be
+   measured.
+
+        Use lower values to increase resolution, but make sure that overflow
+   does not occur. If there are issues with the return value overflowing,
+   increase this value.
+
+        CLOCKS_PER_SEC is the number of barebones_clock() ticks assumed per
+   second; <time_in_secs> and print_coremarks() both derive their results
+   from it. NOTE(review): 10000000 presumably stands for a 10 MHz core
+   clock - confirm against the clock actually driving the cycle counter.
+        */
+#define CLOCKS_PER_SEC             10000000
+#define GETMYTIME(_t)              (*_t = barebones_clock()) /* store the current tick count through pointer _t */
+#define MYTIMEDIFF(fin, ini)       ((fin) - (ini))
+#define TIMER_RES_DIVIDER          1
+#define SAMPLE_TIME_IMPLEMENTATION 1
+#define EE_TICKS_PER_SEC           (CLOCKS_PER_SEC / TIMER_RES_DIVIDER)
+
+/** Define Host specific (POSIX), or target specific global time variables. */
+static CORETIMETYPE start_time_val, stop_time_val; /* written by start_time() / stop_time(), read by get_time() */
+
+/* Function : start_time
+        Called right before the timed portion of the benchmark starts.
+
+        Records the current value of barebones_clock() in start_time_val.
+*/
+void
+start_time(void)
+{
+    start_time_val = barebones_clock();
+}
+/* Function : stop_time
+        Called right after the timed portion of the benchmark ends.
+
+        Records the current value of barebones_clock() in stop_time_val.
+*/
+void
+stop_time(void)
+{
+    stop_time_val = barebones_clock();
+}
+/* Function : get_time
+        Return the number of ticks between the last <start_time> and the last
+   <stop_time>.
+
+        The value is the difference of the two recorded barebones_clock()
+   readings, converted to CORE_TICKS. It can be turned into seconds with
+   <time_in_secs>.
+*/
+CORE_TICKS
+get_time(void)
+{
+    return (CORE_TICKS)(stop_time_val - start_time_val);
+}
+/* Function : time_in_secs
+        Convert a tick count as returned by <get_time> to seconds.
+
+        Both the tick count and EE_TICKS_PER_SEC are converted to <secs_ret>
+   before dividing, so the precision of the result is whatever that type
+   provides (it exists to accommodate systems without floating point).
+*/
+secs_ret
+time_in_secs(CORE_TICKS ticks)
+{
+    const secs_ret ticks_per_sec = (secs_ret)EE_TICKS_PER_SEC;
+
+    return ((secs_ret)ticks) / ticks_per_sec;
+}
+
+ee_u32 default_num_contexts = 1; /* number of contexts main() checks and reports; main() caps it at MULTITHREAD when MULTITHREAD > 1 */
+
+/* Function : portable_init
+        Target specific initialization code.
+
+        Prints the build banner (BUILD and ARCH), then tests for some common
+   porting mistakes: ee_ptr_int must have the size of a pointer and ee_u32
+   must be 4 bytes. A failed test only prints an error message. Finally
+   p->portable_id is set to 1.
+
+        argc and argv are not used by this port.
+*/
+void
+portable_init(core_portable *p, int *argc, char *argv[])
+{
+    const int ptr_int_ok = (sizeof(ee_ptr_int) == sizeof(ee_u8 *));
+    const int u32_ok     = (sizeof(ee_u32) == 4);
+
+    /* banner, followed by the same blank lines as before */
+    ee_printf("build: %s for %s\n",BUILD,ARCH);
+    ee_printf("\n");
+    ee_printf("\n\n");
+
+    if (!ptr_int_ok)
+    {
+        ee_printf(
+            "ERROR! Please define ee_ptr_int to a type that holds a "
+            "pointer!\n");
+    }
+    if (!u32_ok)
+    {
+        ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+    }
+
+    p->portable_id = 1;
+}
+
+
+// Print kx as a "fixed point" number with three decimals, i.e. kx/1000.
+// The fraction is zero-padded by hand, one printf("0") per missing digit.
+void printk(uint64_t kx) {
+    const int whole  = (int)(kx / 1000);
+    const int millis = (int)(kx % 1000);
+    int       threshold;
+
+    printf("%d.",whole);
+    // one leading zero below 100, a second one below 10
+    for (threshold = 100; threshold >= 10; threshold /= 10) {
+        if (millis < threshold) {
+            printf("0");
+        }
+    }
+    printf("%d",millis);
+}
+
+
+/* Print this port's score summary for a run that took `ticks' clock ticks.
+ *
+ * Three lines are printed: the raw tick count, the CoreMark/MHz figure and
+ * a cycles-per-instruction figure. The last two are fixed-point numbers
+ * with three decimals (see printk).
+ *
+ * The iteration count comes from the ITERATIONS macro, not from the count
+ * main() actually ran, so the score is only meaningful when the two agree.
+ * The CPI line divides rdcycle() by rdinstret() as read here; presumably
+ * these are totals since the counters started rather than for the timed
+ * region only - TODO confirm.
+ *
+ * A run shorter than one millisecond, a CLOCKS_PER_SEC below 1 MHz, or a
+ * zero instruction count would make a divisor zero. The affected line then
+ * prints "n/a" instead of dividing.
+ */
+void print_coremarks(uint64_t ticks) {
+   const uint64_t MHz = CLOCKS_PER_SEC/1000000;
+   /* elapsed time in milliseconds, i.e. seconds scaled by 1000 */
+   const uint64_t ksecs = ticks/(CLOCKS_PER_SEC/1000);
+
+   /* the tick count is still narrowed to int for printing, as before */
+   printf("*** Ticks        : %d\n",(int)ticks);
+
+   printf("*** Coremark/MHz : ");
+   if (ksecs != 0 && MHz != 0) {
+      /* widen before multiplying so a large ITERATIONS cannot overflow int */
+      const uint64_t kiter_per_sec = ((uint64_t)ITERATIONS*1000*1000)/ksecs;
+      printk(kiter_per_sec/MHz);
+   } else {
+      printf("n/a");
+   }
+   printf("\n");
+
+   const uint64_t kticks2  = rdcycle() * (uint64_t)1000;
+   const uint64_t instret2 = rdinstret();
+   printf("*** CPI (2)      : ");
+   if (instret2 != 0) {
+      printk(kticks2/instret2);
+   } else {
+      printf("n/a");
+   }
+   printf("\n");
+}
+
+/* Function : portable_fini
+        Target specific final code.
+
+        This port only clears p->portable_id and returns to the caller.
+*/
+void
+portable_fini(core_portable *p)
+{
+    p->portable_id = 0;
+}
--- a/FIRMWARE/COREMARK/core_portme.h
+++ b/FIRMWARE/COREMARK/core_portme.h
@@ -0,0 +1,225 @@
+#pragma once
+#include <stdint.h>
+
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#define ITERATIONS 300
+#define BUILD "testbench"
+#define ARCH "petituyau"
+
+#include <stddef.h>
+
+/* Topic : Description
+        This file contains configuration constants required to execute on
+   different platforms
+*/
+#ifndef CORE_PORTME_H
+#define CORE_PORTME_H
+/************************/
+/* Data types and settings */
+/************************/
+/* Configuration : HAS_FLOAT
+        Define to 1 if the platform supports floating point.
+*/
+#ifndef HAS_FLOAT
+#define HAS_FLOAT 0
+#endif
+/* Configuration : HAS_TIME_H
+        Define to 1 if platform has the time.h header file,
+        and implementation of functions thereof.
+*/
+#ifndef HAS_TIME_H
+#define HAS_TIME_H 0
+#endif
+/* Configuration : USE_CLOCK
+        Define to 1 if platform has the time.h header file,
+        and implementation of functions thereof.
+*/
+#ifndef USE_CLOCK
+#define USE_CLOCK 0
+#endif
+/* Configuration : HAS_STDIO
+        Define to 1 if the platform has stdio.h.
+*/
+#ifndef HAS_STDIO
+#define HAS_STDIO 0
+#endif
+/* Configuration : HAS_PRINTF
+        Define to 1 if the platform has stdio.h and implements the printf
+   function.
+*/
+#ifndef HAS_PRINTF
+#define HAS_PRINTF 0
+#endif
+
+/* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
+        Initialize these strings per platform
+*/
+#ifndef COMPILER_VERSION
+#ifdef __GNUC__
+#define COMPILER_VERSION "GCC"__VERSION__
+#else
+#define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
+#endif
+#endif
+#ifndef COMPILER_FLAGS
+#define COMPILER_FLAGS "-O2"
+#endif
+#ifndef MEM_LOCATION
+#define MEM_LOCATION "STACK"
+#endif
+
+/* Data Types :
+        To avoid compiler issues, define the data types that need to be used for
+   8b, 16b and 32b in <core_portme.h>.
+
+        *Important* :
+        ee_ptr_int needs to be the data type used to hold pointers, otherwise
+   coremark may fail!!!
+*/
+typedef signed short   ee_s16;
+typedef unsigned short ee_u16;
+typedef signed int     ee_s32;
+typedef double         ee_f32; /* NOTE(review): named f32 but declared double - confirm intent */
+typedef unsigned char  ee_u8;
+typedef unsigned int   ee_u32;
+typedef ee_u32         ee_ptr_int; /* same width as ee_u32, so only correct where pointers have that width */
+typedef size_t         ee_size_t;
+#define NULL ((void *)0) /* NOTE(review): <stddef.h>, included above, normally defines NULL already */
+/* align_mem :
+        This macro is used to align an offset to point to a 32b value. It is
+   used in the Matrix algorithm to initialize the input memory blocks.
+*/
+#define align_mem(x) (void *)(4 + (((ee_ptr_int)(x)-1) & ~3))
+
+/* Configuration : CORE_TICKS
+        Define type of return from the timing functions.
+ */
+//#define CORETIMETYPE ee_u32
+//typedef ee_u32 CORE_TICKS;
+
+#define CORETIMETYPE uint64_t
+typedef uint64_t CORE_TICKS;
+
+
+/* Configuration : SEED_METHOD
+        Defines method to get seed values that cannot be computed at compile
+   time.
+
+        Valid values :
+        SEED_ARG - from command line.
+        SEED_FUNC - from a system function.
+        SEED_VOLATILE - from volatile variables.
+*/
+#ifndef SEED_METHOD
+#define SEED_METHOD SEED_VOLATILE
+#endif
+
+/* Configuration : MEM_METHOD
+        Defines method to get a block of memory.
+
+        Valid values :
+        MEM_MALLOC - for platforms that implement malloc and have malloc.h.
+        MEM_STATIC - to use a static memory array.
+        MEM_STACK - to allocate the data block on the stack (NYI).
+*/
+#ifndef MEM_METHOD
+#define MEM_METHOD MEM_STACK
+#endif
+
+/* Configuration : MULTITHREAD
+        Define for parallel execution
+
+        Valid values :
+        1 - only one context (default).
+        N>1 - will execute N copies in parallel.
+
+        Note :
+        If this flag is defined to more than 1, an implementation for launching
+   parallel contexts must be defined.
+
+        Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK>
+   to enable them.
+
+        It is valid to have a different implementation of <core_start_parallel>
+   and <core_end_parallel> in <core_portme.c>, to fit a particular architecture.
+*/
+#ifndef MULTITHREAD
+#define MULTITHREAD 1
+#define USE_PTHREAD 0
+#define USE_FORK    0
+#define USE_SOCKET  0
+#endif
+
+/* Configuration : MAIN_HAS_NOARGC
+        Needed if platform does not support getting arguments to main.
+
+        Valid values :
+        0 - argc/argv to main is supported
+        1 - argc/argv to main is not supported
+
+        Note :
+        This flag only matters if MULTITHREAD has been defined to a value
+   greater than 1.
+*/
+#ifndef MAIN_HAS_NOARGC
+#define MAIN_HAS_NOARGC 1
+#endif
+
+/* Configuration : MAIN_HAS_NORETURN
+        Needed if platform does not support returning a value from main.
+
+        Valid values :
+        0 - main returns an int, and return value will be 0.
+        1 - platform does not support returning a value from main
+*/
+#ifndef MAIN_HAS_NORETURN
+#define MAIN_HAS_NORETURN 0
+#endif
+
+/* Variable : default_num_contexts
+        Not used for this simple port, must contain the value 1.
+*/
+extern ee_u32 default_num_contexts;
+
+typedef struct CORE_PORTABLE_S
+{
+    ee_u8 portable_id;
+} core_portable;
+
+/* target specific init/fini */
+void portable_init(core_portable *p, int *argc, char *argv[]);
+void portable_fini(core_portable *p);
+
+#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) \
+    && !defined(VALIDATION_RUN)
+#if (TOTAL_DATA_SIZE == 1200)
+#define PROFILE_RUN 1
+#elif (TOTAL_DATA_SIZE == 2000)
+#define PERFORMANCE_RUN 1
+#else
+#define VALIDATION_RUN 1
+#endif
+#endif
+
+int ee_printf(const char *fmt, ...);
+void print_coremarks(uint64_t ticks);
+
+#endif /* CORE_PORTME_H */
+
--- a/FIRMWARE/COREMARK/core_state.c
+++ b/FIRMWARE/COREMARK/core_state.c
@@ -0,0 +1,330 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/* local functions */
+enum CORE_STATE core_state_transition(ee_u8 **instr, ee_u32 *transition_count);
+
+/*
+Topic: Description
+        Simple state machines like this one are used in many embedded products.
+
+        For more complex state machines, sometimes a state transition table
+implementation is used instead, trading speed of direct coding for ease of
+maintenance.
+
+        Since the main goal of using a state machine in CoreMark is to exercise
+the switch/if behaviour, we are using a small Moore machine.
+
+        In particular, this machine tests type of string input,
+        trying to determine whether the input is a number or something else.
+        (see core_state.png).
+*/
+
+/* Function: core_bench_state
+        Benchmark function. Runs the state machine over memblock twice: once
+   as is, and once after XOR-ing bytes with seed1 (the "corruption"). A final
+   XOR pass with seed2 follows, then all counters are folded into crc.
+        Returns the updated crc.
+*/
+ee_u16
+core_bench_state(ee_u32 blksize,
+                 ee_u8 *memblock,
+                 ee_s16 seed1,
+                 ee_s16 seed2,
+                 ee_s16 step,
+                 ee_u16 crc)
+{
+    ee_u32 final_counts[NUM_CORE_STATES]; /* number of tokens per end state */
+    ee_u32 track_counts[NUM_CORE_STATES]; /* updated by core_state_transition */
+    ee_u8 *p = memblock;
+    ee_u32 i;
+
+#if CORE_DEBUG
+    ee_printf("State Bench: %d,%d,%d,%04x\n", seed1, seed2, step, crc);
+#endif
+    for (i = 0; i < NUM_CORE_STATES; i++)
+    {
+        final_counts[i] = track_counts[i] = 0;
+    }
+    /* run the state machine over the input */
+    while (*p != 0) /* the input ends at the first zero byte */
+    {
+        enum CORE_STATE fstate = core_state_transition(&p, track_counts);
+        final_counts[fstate]++;
+#if CORE_DEBUG
+        ee_printf("%d,", fstate);
+    }
+    ee_printf("\n");
+#else
+    } /* closes the while loop; the CORE_DEBUG branch closes it above */
+#endif
+    p = memblock;
+    while (p < (memblock + blksize))
+    { /* insert some corruption: every step-th byte, ',' separators excluded */
+        if (*p != ',')
+            *p ^= (ee_u8)seed1;
+        p += step; /* NOTE(review): step <= 0 would never leave this loop safely - confirm callers */
+    }
+    p = memblock;
+    /* run the state machine over the input again */
+    while (*p != 0)
+    {
+        enum CORE_STATE fstate = core_state_transition(&p, track_counts);
+        final_counts[fstate]++;
+#if CORE_DEBUG
+        ee_printf("%d,", fstate);
+    }
+    ee_printf("\n");
+#else
+    } /* closes the while loop; the CORE_DEBUG branch closes it above */
+#endif
+    p = memblock;
+    while (p < (memblock + blksize))
+    { /* undoes the corruption only if seed1 and seed2 are equal */
+        if (*p != ',')
+            *p ^= (ee_u8)seed2;
+        p += step;
+    }
+    /* fold both counter arrays into the crc, final count first for each state */
+    for (i = 0; i < NUM_CORE_STATES; i++)
+    {
+        crc = crcu32(final_counts[i], crc);
+        crc = crcu32(track_counts[i], crc);
+    }
+    return crc;
+}
+
+/* Default initialization patterns; core_init_state copies 4 bytes of an intpat entry and 8 bytes of any other entry */
+static ee_u8 *intpat[4]
+    = { (ee_u8 *)"5012", (ee_u8 *)"1234", (ee_u8 *)"-874", (ee_u8 *)"+122" };
+static ee_u8 *floatpat[4] = { (ee_u8 *)"35.54400",
+                              (ee_u8 *)".1234500",
+                              (ee_u8 *)"-110.700",
+                              (ee_u8 *)"+0.64400" };
+static ee_u8 *scipat[4]   = { (ee_u8 *)"5.500e+3",
+                            (ee_u8 *)"-.123e-2",
+                            (ee_u8 *)"-87e+832",
+                            (ee_u8 *)"+0.6e-12" };
+static ee_u8 *errpat[4]   = { (ee_u8 *)"T0.3e-1F", /* used for the "invalid" case */
+                            (ee_u8 *)"-T.T++Tq",
+                            (ee_u8 *)"1T3.4e4z",
+                            (ee_u8 *)"34.0e-T^" };
+
+/* Function: core_init_state
+        Build the input data for the state machine.
+
+        The buffer receives a run of predetermined strings, each followed by
+        ',' and never split, and is zero filled from there up to size bytes.
+        Which strings are used depends on the seed parameter, which is
+        advanced by one for every string.
+
+        Note:
+        The seed parameter MUST be supplied from a source that cannot be
+   determined at compile time
+*/
+void
+core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p)
+{
+    ee_u32 filled  = 0; /* bytes of p written so far */
+    ee_u32 pending = 0; /* length of the pattern waiting in pat (0: none) */
+    ee_u8 *pat     = 0; /* pattern chosen on the previous pass */
+    ee_u32 k;
+#if CORE_DEBUG
+    ee_u8 *start = p;
+    ee_printf("State: %d,%d\n", size, seed);
+#endif
+    size--; /* hold back one byte while patterns are being placed */
+    /* A pattern is copied only once it is known to fit together with its ',' */
+    while ((filled + pending + 1) < size)
+    {
+        int kind, variant;
+
+        if (pending > 0)
+        {
+            for (k = 0; k < pending; k++)
+                p[filled + k] = pat[k];
+            p[filled + k] = ',';
+            filled += pending + 1;
+        }
+        seed++;
+        kind    = seed & 0x7;        /* low 3 bits pick the pattern class */
+        variant = (seed >> 3) & 0x3; /* next 2 bits pick one of 4 strings */
+        if (kind <= 2)
+        { /* 0..2: int */
+            pat     = intpat[variant];
+            pending = 4;
+        }
+        else if (kind <= 4)
+        { /* 3..4: float */
+            pat     = floatpat[variant];
+            pending = 8;
+        }
+        else if (kind <= 6)
+        { /* 5..6: scientific */
+            pat     = scipat[variant];
+            pending = 8;
+        }
+        else
+        { /* 7: invalid */
+            pat     = errpat[variant];
+            pending = 8;
+        }
+    }
+    /* fill the rest with 0, including the byte held back above */
+    for (size++; filled < size; filled++)
+        p[filled] = 0;
+#if CORE_DEBUG
+    ee_printf("State Input: %s\n", start);
+#endif
+}
+
+/* Return 1 when c is an ASCII decimal digit, 0 otherwise. */
+static ee_u8
+ee_isdigit(ee_u8 c)
+{
+    /* bitwise & rather than &&, as before: both comparisons are evaluated */
+    return (ee_u8)(('0' <= c) & (c <= '9'));
+}
+
+/* Function: core_state_transition
+        Actual state machine: consumes one token of the input.
+
+        Scanning stops at the first of: a ',' separator (consumed, not
+        classified), the terminating zero byte, or a character that is
+        invalid in the current state.
+
+        *instr is updated to point just past the last character consumed, and
+   the end state is returned. transition_count[] tallies state changes.
+*/
+
+enum CORE_STATE
+core_state_transition(ee_u8 **instr, ee_u32 *transition_count)
+{
+    ee_u8 *         str = *instr;
+    ee_u8           NEXT_SYMBOL;
+    enum CORE_STATE state = CORE_START;
+    for (; *str && state != CORE_INVALID; str++) /* str++ still runs after the character that invalidated the token */
+    {
+        NEXT_SYMBOL = *str;
+        if (NEXT_SYMBOL == ',') /* end of this input */
+        {
+            str++; /* step over the separator before leaving */
+            break;
+        }
+        switch (state)
+        {
+            case CORE_START: /* first character of the token */
+                if (ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INT;
+                }
+                else if (NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-')
+                {
+                    state = CORE_S1;
+                }
+                else if (NEXT_SYMBOL == '.')
+                {
+                    state = CORE_FLOAT;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_INVALID]++;
+                }
+                transition_count[CORE_START]++; /* counted for every first character, valid or not */
+                break;
+            case CORE_S1: /* after a leading sign; every branch counts as a transition */
+                if (ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INT;
+                    transition_count[CORE_S1]++;
+                }
+                else if (NEXT_SYMBOL == '.')
+                {
+                    state = CORE_FLOAT;
+                    transition_count[CORE_S1]++;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_S1]++;
+                }
+                break;
+            case CORE_INT: /* further digits keep the state and are not counted */
+                if (NEXT_SYMBOL == '.')
+                {
+                    state = CORE_FLOAT;
+                    transition_count[CORE_INT]++;
+                }
+                else if (!ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_INT]++;
+                }
+                break;
+            case CORE_FLOAT: /* after '.'; further digits keep the state */
+                if (NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e')
+                {
+                    state = CORE_S2;
+                    transition_count[CORE_FLOAT]++;
+                }
+                else if (!ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_FLOAT]++;
+                }
+                break;
+            case CORE_S2: /* after the exponent marker: a sign is mandatory */
+                if (NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-')
+                {
+                    state = CORE_EXPONENT;
+                    transition_count[CORE_S2]++;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_S2]++;
+                }
+                break;
+            case CORE_EXPONENT: /* after the exponent sign: a digit is required */
+                if (ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_SCIENTIFIC;
+                    transition_count[CORE_EXPONENT]++;
+                }
+                else
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_EXPONENT]++;
+                }
+                break;
+            case CORE_SCIENTIFIC: /* exponent digits; further digits keep the state */
+                if (!ee_isdigit(NEXT_SYMBOL))
+                {
+                    state = CORE_INVALID;
+                    transition_count[CORE_INVALID]++; /* note: tallied under CORE_INVALID, unlike the other states */
+                }
+                break;
+            default: /* CORE_INVALID does not get here: the loop condition exits first */
+                break;
+        }
+    }
+    *instr = str;
+    return state;
+}
--- a/FIRMWARE/COREMARK/core_util.c
+++ b/FIRMWARE/COREMARK/core_util.c
@@ -0,0 +1,249 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+#include "coremark.h"
+/* Function: get_seed
+        Get a value that cannot be determined at compile time.
+
+        Since different embedded systems and compilers are used, 3 different
+   methods are provided: 1 - Using a volatile variable. This method is only
+   valid if the compiler is forced to generate code that reads the value of a
+   volatile variable from memory at run time. Please note, if using this method,
+   you would need to modify core_portme.c to generate training profile. 2 -
+   Command line arguments. This is the preferred method if command line
+   arguments are supported. 3 - System function. If none of the first 2 methods
+   is available on the platform, a system function which is not a stub can be
+   used.
+
+        e.g. read the value on GPIO pins connected to switches, or invoke
+   special simulator functions.
+*/
+#if (SEED_METHOD == SEED_VOLATILE)
+extern volatile ee_s32 seed1_volatile;
+extern volatile ee_s32 seed2_volatile;
+extern volatile ee_s32 seed3_volatile;
+extern volatile ee_s32 seed4_volatile;
+extern volatile ee_s32 seed5_volatile;
+ee_s32
+get_seed_32(int i)
+{
+    /* Seed i (1..5) is read from its volatile only when selected, so one
+       volatile access happens per call, and none for any other index. */
+    if (i == 1)
+    {
+        return seed1_volatile;
+    }
+    if (i == 2)
+    {
+        return seed2_volatile;
+    }
+    if (i == 3)
+    {
+        return seed3_volatile;
+    }
+    if (i == 4)
+    {
+        return seed4_volatile;
+    }
+    if (i == 5)
+    {
+        return seed5_volatile;
+    }
+    return 0;
+}
+#elif (SEED_METHOD == SEED_ARG)
+/* Convert a seed argument to a number.
+   Accepted form: optional '-', optional "0x" (lower-case hex digits only),
+   digits, then an optional 'K' (x1024) or 'M' (x1024*1024) suffix.
+   Whatever follows is ignored; no overflow checking is done. */
+ee_s32
+parseval(char *valstring)
+{
+    const char *cur   = valstring;
+    ee_s32      value = 0;
+    ee_s32      sign  = 1;
+    if (*cur == '-')
+    {
+        sign = -1;
+        cur++;
+    }
+    if ((cur[0] == '0') && (cur[1] == 'x'))
+    { /* hexadecimal */
+        cur += 2;
+        for (;; cur++)
+        {
+            ee_s32 digit;
+            if ((*cur >= '0') && (*cur <= '9'))
+                digit = *cur - '0';
+            else if ((*cur >= 'a') && (*cur <= 'f'))
+                digit = 10 + *cur - 'a';
+            else
+                break;
+            value = value * 16 + digit;
+        }
+    }
+    else
+    { /* decimal */
+        for (; (*cur >= '0') && (*cur <= '9'); cur++)
+            value = value * 10 + (*cur - '0');
+    }
+    /* optional size qualifier right after the digits */
+    switch (*cur)
+    {
+        case 'K':
+            value *= 1024;
+            break;
+        case 'M':
+            value *= 1024 * 1024;
+            break;
+        default:
+            break;
+    }
+    return value * sign;
+}
+
+/* Seed i taken from the command line: argv[i] parsed by parseval(), or 0
+   when argc is not greater than i. */
+ee_s32
+get_seed_args(int i, int argc, char *argv[])
+{
+    return (i < argc) ? parseval(argv[i]) : 0;
+}
+
+#elif (SEED_METHOD == SEED_FUNC)
+/* SEED_FUNC variant: the seeds come from portme_sys1() .. portme_sys5(),
+ * which you must define and implement in core_portme.h and
+ * core_portme.c ! */
+ee_s32
+get_seed_32(int i)
+{
+    /* Only the selected hook is called; an index outside 1..5 yields 0 */
+    if (i == 1)
+    {
+        return portme_sys1();
+    }
+    if (i == 2)
+    {
+        return portme_sys2();
+    }
+    if (i == 3)
+    {
+        return portme_sys3();
+    }
+    if (i == 4)
+    {
+        return portme_sys4();
+    }
+    if (i == 5)
+    {
+        return portme_sys5();
+    }
+    return 0;
+}
+#endif
+
+/* Function: crc*
+        Service functions to calculate 16b CRC code.
+
+        crcu8 folds the 8 bits of data into crc, least significant bit first,
+   and returns the new crc. The other crc* helpers are built on top of it.
+*/
+ee_u16
+crcu8(ee_u8 data, ee_u16 crc)
+{
+    ee_u8 bit;
+
+    for (bit = 0; bit < 8; bit++, data >>= 1)
+    {
+        /* feedback = current data bit XOR current low bit of the crc */
+        const ee_u8 feedback = (ee_u8)((data ^ crc) & 1);
+        ee_u16      shifted;
+
+        if (feedback)
+            crc ^= 0x4002; /* applied before the shift */
+        shifted = (ee_u16)(crc >> 1);
+
+        /* the feedback bit becomes the new top bit */
+        if (feedback)
+            crc = (ee_u16)(shifted | 0x8000);
+        else
+            crc = (ee_u16)(shifted & 0x7fff);
+    }
+    return crc;
+}
+ee_u16
+crcu16(ee_u16 newval, ee_u16 crc)
+{
+    /* feed the low byte first, then the high byte */
+    const ee_u8 lo = (ee_u8)newval, hi = (ee_u8)(newval >> 8);
+    return crcu8(hi, crcu8(lo, crc));
+}
+ee_u16
+crcu32(ee_u32 newval, ee_u16 crc)
+{
+    /* low 16 bits first, then the high 16 bits, each through crc16() */
+    const ee_s16 lo = (ee_s16)newval, hi = (ee_s16)(newval >> 16);
+    return crc16(hi, crc16(lo, crc));
+}
+ee_u16
+crc16(ee_s16 newval, ee_u16 crc)
+{ /* signed entry point: newval is converted to ee_u16 and passed to crcu16() */
+    return crcu16((ee_u16)newval, crc);
+}
+
+/* Verify the sizes of the ee_* types, printing one message per mismatch.
+   Returns the number of mismatches found (0 when every size is as expected). */
+ee_u8
+check_data_types(void)
+{
+    ee_u8 retval = 0;
+    if (sizeof(ee_u8) != 1)
+    {
+        ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_u16) != 2)
+    {
+        ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_s16) != 2)
+    {
+        ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_s32) != 4)
+    {
+        ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_u32) != 4)
+    {
+        ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
+        retval++;
+    }
+    if (sizeof(ee_ptr_int) != sizeof(int *))
+    {
+        ee_printf(
+            "ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
+        retval++;
+    }
+    if (retval > 0) /* one closing hint, however many checks failed */
+        ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
+    return retval;
+}
--- a/FIRMWARE/COREMARK/coremark.h
+++ b/FIRMWARE/COREMARK/coremark.h
@@ -0,0 +1,184 @@
+#pragma once
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Original Author: Shay Gal-on
+*/
+
+/* Topic: Description
+        This file contains  declarations of the various benchmark functions.
+*/
+
+/* Configuration: TOTAL_DATA_SIZE
+        Define total size for data algorithms will operate on (parenthesized: safe inside expressions)
+*/
+#ifndef TOTAL_DATA_SIZE
+#define TOTAL_DATA_SIZE (2 * 1000)
+#endif
+
+#define SEED_ARG      0
+#define SEED_FUNC     1
+#define SEED_VOLATILE 2
+
+#define MEM_STATIC 0
+#define MEM_MALLOC 1
+#define MEM_STACK  2
+
+#include "core_portme.h"
+
+#if HAS_STDIO
+#include <stdio.h>
+#endif
+#if HAS_PRINTF
+#define ee_printf printf
+#endif
+
+/* Actual benchmark execution in iterate */
+void *iterate(void *pres);
+
+/* Typedef: secs_ret
+        For machines that have floating point support, get number of seconds as
+   a double. Otherwise an unsigned int.
+*/
+#if HAS_FLOAT
+typedef double secs_ret;
+#else
+typedef ee_u32 secs_ret;
+#endif
+
+#if MAIN_HAS_NORETURN
+#define MAIN_RETURN_VAL
+#define MAIN_RETURN_TYPE void
+#else
+#define MAIN_RETURN_VAL  0
+#define MAIN_RETURN_TYPE int
+#endif
+
+void       start_time(void);
+void       stop_time(void);
+CORE_TICKS get_time(void);
+secs_ret   time_in_secs(CORE_TICKS ticks);
+
+/* Misc useful functions */
+ee_u16 crcu8(ee_u8 data, ee_u16 crc);
+ee_u16 crc16(ee_s16 newval, ee_u16 crc);
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc);
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc);
+ee_u8  check_data_types(void);
+void * portable_malloc(ee_size_t size);
+void   portable_free(void *p);
+ee_s32 parseval(char *valstring);
+
+/* Algorithm IDS */
+#define ID_LIST             (1 << 0)
+#define ID_MATRIX           (1 << 1)
+#define ID_STATE            (1 << 2)
+#define ALL_ALGORITHMS_MASK (ID_LIST | ID_MATRIX | ID_STATE)
+#define NUM_ALGORITHMS      3
+
+/* list data structures */
+typedef struct list_data_s
+{
+    ee_s16 data16;
+    ee_s16 idx;
+} list_data;
+
+typedef struct list_head_s
+{
+    struct list_head_s *next;
+    struct list_data_s *info;
+} list_head;
+
+/*matrix benchmark related stuff */
+#define MATDAT_INT 1
+#if MATDAT_INT
+typedef ee_s16 MATDAT;
+typedef ee_s32 MATRES;
+#else
+typedef ee_f16 MATDAT;
+typedef ee_f32 MATRES;
+#endif
+
+typedef struct MAT_PARAMS_S
+{
+    int     N;
+    MATDAT *A;
+    MATDAT *B;
+    MATRES *C;
+} mat_params;
+
+/* state machine related stuff */
+/* List of all the possible states for the FSM */
+typedef enum CORE_STATE
+{
+    CORE_START = 0,
+    CORE_INVALID,
+    CORE_S1,
+    CORE_S2,
+    CORE_INT,
+    CORE_FLOAT,
+    CORE_EXPONENT,
+    CORE_SCIENTIFIC,
+    NUM_CORE_STATES
+} core_state_e;
+
+/* Helper structure to hold results */
+typedef struct RESULTS_S
+{
+    /* inputs */
+    ee_s16              seed1;       /* Initializing seed */
+    ee_s16              seed2;       /* Initializing seed */
+    ee_s16              seed3;       /* Initializing seed */
+    void *              memblock[4]; /* Pointer to safe memory location */
+    ee_u32              size;        /* Size of the data */
+    ee_u32              iterations;  /* Number of iterations to execute */
+    ee_u32              execs;       /* Bitmask of operations to execute */
+    struct list_head_s *list;
+    mat_params          mat;
+    /* outputs */
+    ee_u16 crc;
+    ee_u16 crclist;
+    ee_u16 crcmatrix;
+    ee_u16 crcstate;
+    ee_s16 err;
+    /* multithread specific */
+    core_portable port;
+} core_results;
+
+/* Multicore execution handling */
+#if (MULTITHREAD > 1)
+ee_u8 core_start_parallel(core_results *res);
+ee_u8 core_stop_parallel(core_results *res);
+#endif
+
+/* list benchmark functions */
+list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed);
+ee_u16     core_bench_list(core_results *res, ee_s16 finder_idx);
+
+/* state benchmark functions */
+void   core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p);
+ee_u16 core_bench_state(ee_u32 blksize,
+                        ee_u8 *memblock,
+                        ee_s16 seed1,
+                        ee_s16 seed2,
+                        ee_s16 step,
+                        ee_u16 crc);
+
+/* matrix benchmark functions */
+ee_u32 core_init_matrix(ee_u32      blksize,
+                        void *      memblk,
+                        ee_s32      seed,
+                        mat_params *p);
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc);
--- a/FIRMWARE/COREMARK/ee_printf.c
+++ b/FIRMWARE/COREMARK/ee_printf.c
@@ -0,0 +1,712 @@
+/*
+Copyright 2018 Embedded Microprocessor Benchmark Consortium (EEMBC)
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <io.h>
+#include "coremark.h"
+#include <stdarg.h>
+
+/* Formatting flags: a bitmask built by ee_vsprintf() while parsing a
+   conversion and passed as the `type`/`flags` argument of the helpers. */
+#define ZEROPAD   (1 << 0) /* Pad with zero */
+#define SIGN      (1 << 1) /* Unsigned/signed long */
+#define PLUS      (1 << 2) /* Show plus */
+#define SPACE     (1 << 3) /* Spacer */
+#define LEFT      (1 << 4) /* Left justified */
+#define HEX_PREP  (1 << 5) /* 0x */
+#define UPPERCASE (1 << 6) /* 'ABCDEF' */
+
+/* True for '0'..'9'. The argument is evaluated twice, so it must not have
+   side effects. */
+#define is_digit(c) ((c) >= '0' && (c) <= '9')
+
+/* Digit characters indexed by digit value, covering bases up to 36. */
+static char *    digits       = "0123456789abcdefghijklmnopqrstuvwxyz";
+static char *    upper_digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+/* Forward declaration of the file-local bounded string length helper. */
+static ee_size_t strnlen(const char *s, ee_size_t count);
+
+/*
+ * Bounded string length: returns the number of characters in s before the
+ * terminating NUL, looking at no more than count characters.
+ */
+static ee_size_t
+strnlen(const char *s, ee_size_t count)
+{
+    ee_size_t len = 0;
+
+    while (s[len] != '\0' && count != 0)
+    {
+        count--;
+        len++;
+    }
+
+    return len;
+}
+
+/*
+ * Parse the run of decimal digits starting at *s and return its value.
+ * On return *s has been advanced to the first non-digit character.
+ * Returns 0 (and leaves *s untouched) when *s does not start with a digit.
+ */
+static int
+skip_atoi(const char **s)
+{
+    const char *cursor = *s;
+    int         value  = 0;
+
+    for (; *cursor >= '0' && *cursor <= '9'; cursor++)
+        value = value * 10 + *cursor - '0';
+
+    *s = cursor;
+    return value;
+}
+
+/*
+ * Format an integer into str and return a pointer just past the last
+ * character written. No NUL terminator is appended.
+ *
+ *   str       - output cursor
+ *   num       - value to print; treated as signed only when SIGN is set in
+ *               type, otherwise its bit pattern is printed as unsigned long
+ *   base      - radix, 2..36; for any other value nothing is written and
+ *               0 is returned
+ *   size      - minimum field width (<= 0 means no padding)
+ *   precision - minimum number of digits (<= 0 means no minimum)
+ *   type      - bitmask of ZEROPAD, SIGN, PLUS, SPACE, LEFT, HEX_PREP and
+ *               UPPERCASE
+ *
+ * The magnitude is computed in unsigned arithmetic so that the most
+ * negative long is handled without signed overflow.
+ */
+static char *
+number(char *str, long num, int base, int size, int precision, int type)
+{
+    char          c, sign, tmp[66];
+    char *        dig = digits;
+    unsigned long mag = (unsigned long)num;
+    int           i;
+
+    if (type & UPPERCASE)
+        dig = upper_digits;
+    /* Left justification overrides zero padding */
+    if (type & LEFT)
+        type &= ~ZEROPAD;
+    if (base < 2 || base > 36)
+        return 0;
+
+    c    = (type & ZEROPAD) ? '0' : ' ';
+    sign = 0;
+    if (type & SIGN)
+    {
+        if (num < 0)
+        {
+            sign = '-';
+            mag  = 0UL - mag; /* two's-complement negate, well defined */
+            size--;
+        }
+        else if (type & PLUS)
+        {
+            sign = '+';
+            size--;
+        }
+        else if (type & SPACE)
+        {
+            sign = ' ';
+            size--;
+        }
+    }
+
+    /* Reserve room for the "0x" or "0" prefix */
+    if (type & HEX_PREP)
+    {
+        if (base == 16)
+            size -= 2;
+        else if (base == 8)
+            size--;
+    }
+
+    /* Generate the digits least-significant first into tmp */
+    i = 0;
+
+    if (mag == 0)
+        tmp[i++] = '0';
+    else
+    {
+        while (mag != 0)
+        {
+            tmp[i++] = dig[mag % (unsigned)base];
+            mag /= (unsigned)base;
+        }
+    }
+
+    if (i > precision)
+        precision = i;
+    size -= precision;
+    /* Leading spaces (right-justified, not zero padded) */
+    if (!(type & (ZEROPAD | LEFT)))
+        while (size-- > 0)
+            *str++ = ' ';
+    if (sign)
+        *str++ = sign;
+
+    if (type & HEX_PREP)
+    {
+        if (base == 8)
+            *str++ = '0';
+        else if (base == 16)
+        {
+            *str++ = '0';
+            *str++ = digits[33]; /* 'x'; always lower case */
+        }
+    }
+
+    /* Zero padding goes between sign/prefix and the digits */
+    if (!(type & LEFT))
+        while (size-- > 0)
+            *str++ = c;
+    /* Leading zeros required by the precision */
+    while (i < precision--)
+        *str++ = '0';
+    /* Digits, most-significant first */
+    while (i-- > 0)
+        *str++ = tmp[i];
+    /* Trailing spaces for left justification */
+    while (size-- > 0)
+        *str++ = ' ';
+
+    return str;
+}
+
+/*
+ * Write the six bytes at addr as colon-separated two-digit hex pairs
+ * ("xx:xx:xx:xx:xx:xx"), padded with spaces to at least `size` characters.
+ * Padding goes on the right when LEFT is set in type, otherwise on the
+ * left; UPPERCASE selects upper-case hex digits. `precision` is not used.
+ * Returns the advanced output cursor; no NUL terminator is written.
+ */
+static char *
+eaddr(char *str, unsigned char *addr, int size, int precision, int type)
+{
+    char  text[24];
+    char *table = (type & UPPERCASE) ? upper_digits : digits;
+    int   used  = 0;
+    int   pad;
+    int   k;
+
+    for (k = 0; k < 6; k++)
+    {
+        if (k > 0)
+            text[used++] = ':';
+        text[used++] = table[addr[k] >> 4];
+        text[used++] = table[addr[k] & 0x0F];
+    }
+
+    /* Number of padding characters still owed (may be <= 0) */
+    pad = size - used;
+
+    if (!(type & LEFT))
+        for (; pad > 0; pad--)
+            *str++ = ' ';
+    for (k = 0; k < used; k++)
+        *str++ = text[k];
+    for (; pad > 0; pad--)
+        *str++ = ' ';
+
+    return str;
+}
+
+/*
+ * Write the four bytes at addr in dotted-decimal form ("a.b.c.d"), padded
+ * with spaces to at least `size` characters. Padding goes on the right
+ * when LEFT is set in type, otherwise on the left. `precision` is not used.
+ * Returns the advanced output cursor; no NUL terminator is written.
+ */
+static char *
+iaddr(char *str, unsigned char *addr, int size, int precision, int type)
+{
+    char text[24];
+    int  used = 0;
+    int  pad;
+    int  k;
+
+    for (k = 0; k < 4; k++)
+    {
+        int value = addr[k];
+
+        if (k > 0)
+            text[used++] = '.';
+
+        /* Emit hundreds and tens only when needed, so no leading zeros */
+        if (value >= 100)
+            text[used++] = digits[value / 100];
+        if (value >= 10)
+            text[used++] = digits[(value / 10) % 10];
+        text[used++] = digits[value % 10];
+    }
+
+    /* Number of padding characters still owed (may be <= 0) */
+    pad = size - used;
+
+    if (!(type & LEFT))
+        for (; pad > 0; pad--)
+            *str++ = ' ';
+    for (k = 0; k < used; k++)
+        *str++ = text[k];
+    for (; pad > 0; pad--)
+        *str++ = ' ';
+
+    return str;
+}
+
+#if HAS_FLOAT
+
+/* Float-to-digit-string converters; only declared here, not defined in
+   this file. */
+char *      ecvtbuf(double arg, int ndigits, int *decpt, int *sign, char *buf);
+char *      fcvtbuf(double arg, int ndigits, int *decpt, int *sign, char *buf);
+static void ee_bufcpy(char *d, char *s, int count);
+
+/*
+ * Copy `count` characters from ps to pd (forward, byte by byte).
+ * A zero or negative count copies nothing, so a miscomputed length can
+ * never turn into a runaway copy.
+ */
+void
+ee_bufcpy(char *pd, char *ps, int count)
+{
+    while (count-- > 0)
+        *pd++ = *ps++;
+}
+
+/*
+ * Convert `value` to text in `buffer` (NUL terminated) using conversion
+ * `fmt` ('e', 'E', 'f', 'g' or 'G') and the given precision.
+ *
+ *   - 'g'/'G' pick between the 'e' and 'f' layouts from the decimal
+ *     exponent; as in ISO C, a precision of 0 is treated as 1 here.
+ *   - 'e'/'E' always write a sign and exactly three exponent digits.
+ *   - Any other fmt produces an empty string.
+ *
+ * Digit generation is delegated to ecvtbuf()/fcvtbuf(); this code assumes
+ * they follow the usual ecvt/fcvt contract (digit string without a decimal
+ * point, *decpt = position of the point, *sign non-zero for negative
+ * values) - confirm against their implementation.
+ * NOTE(review): cvtbuf is 80 bytes and the caller's buffer is not bounded
+ * here; very large values or precisions may not fit - confirm limits.
+ */
+static void
+parse_float(double value, char *buffer, char fmt, int precision)
+{
+    int   decpt, sign, exp, pos;
+    char *cvt = NULL;
+    char  cvtbuf[80];
+    int   capexp = 0;
+    int   magnitude;
+
+    /* Fold upper-case conversions to lower case, remembering the case */
+    if (fmt == 'G' || fmt == 'E')
+    {
+        capexp = 1;
+        fmt += 'a' - 'A';
+    }
+
+    if (fmt == 'g')
+    {
+        /* %g with precision 0 means one significant digit; without this
+           the 'e' path below would be entered with precision -1. */
+        if (precision == 0)
+            precision = 1;
+
+        cvt       = ecvtbuf(value, precision, &decpt, &sign, cvtbuf);
+        magnitude = decpt - 1;
+        if (magnitude < -4 || magnitude > precision - 1)
+        {
+            fmt = 'e';
+            precision -= 1;
+        }
+        else
+        {
+            fmt = 'f';
+            precision -= decpt;
+        }
+    }
+
+    if (fmt == 'e')
+    {
+        cvt = ecvtbuf(value, precision + 1, &decpt, &sign, cvtbuf);
+
+        if (sign)
+            *buffer++ = '-';
+        *buffer++ = *cvt;
+        if (precision > 0)
+            *buffer++ = '.';
+        ee_bufcpy(buffer, cvt + 1, precision);
+        buffer += precision;
+        *buffer++ = capexp ? 'E' : 'e';
+
+        if (decpt == 0)
+        {
+            if (value == 0.0)
+                exp = 0;
+            else
+                exp = -1;
+        }
+        else
+            exp = decpt - 1;
+
+        if (exp < 0)
+        {
+            *buffer++ = '-';
+            exp       = -exp;
+        }
+        else
+            *buffer++ = '+';
+
+        /* Three exponent digits, written least-significant first */
+        buffer[2] = (exp % 10) + '0';
+        exp       = exp / 10;
+        buffer[1] = (exp % 10) + '0';
+        exp       = exp / 10;
+        buffer[0] = (exp % 10) + '0';
+        buffer += 3;
+    }
+    else if (fmt == 'f')
+    {
+        cvt = fcvtbuf(value, precision, &decpt, &sign, cvtbuf);
+        if (sign)
+            *buffer++ = '-';
+        if (*cvt)
+        {
+            if (decpt <= 0)
+            {
+                /* Pure fraction: "0." then -decpt zeros, then the digits */
+                *buffer++ = '0';
+                *buffer++ = '.';
+                for (pos = 0; pos < -decpt; pos++)
+                    *buffer++ = '0';
+                while (*cvt)
+                    *buffer++ = *cvt++;
+            }
+            else
+            {
+                /* Insert the point after decpt digits */
+                pos = 0;
+                while (*cvt)
+                {
+                    if (pos++ == decpt)
+                        *buffer++ = '.';
+                    *buffer++ = *cvt++;
+                }
+            }
+        }
+        else
+        {
+            /* No digits produced: print zero with the requested precision */
+            *buffer++ = '0';
+            if (precision > 0)
+            {
+                *buffer++ = '.';
+                for (pos = 0; pos < precision; pos++)
+                    *buffer++ = '0';
+            }
+        }
+    }
+
+    *buffer = '\0';
+}
+
+/*
+ * Ensure the number text in `buffer` contains a decimal point (used for
+ * the '#' flag when precision is 0).
+ *
+ *   - If a '.' is found before any exponent marker, nothing changes.
+ *   - If an 'e'/'E' is found first, a '.' is inserted in front of it.
+ *   - Otherwise a '.' is appended at the end.
+ *
+ * The buffer must have room for one more character.
+ */
+static void
+decimal_point(char *buffer)
+{
+    while (*buffer)
+    {
+        if (*buffer == '.')
+            return;
+        if (*buffer == 'e' || *buffer == 'E')
+            break;
+        buffer++;
+    }
+
+    if (*buffer)
+    {
+        /* Shift the exponent part, including the 'e' at buffer[0] and the
+           terminating NUL at buffer[n], one place to the right. */
+        int n = strnlen(buffer, 256);
+        while (n >= 0)
+        {
+            buffer[n + 1] = buffer[n];
+            n--;
+        }
+
+        *buffer = '.';
+    }
+    else
+    {
+        *buffer++ = '.';
+        *buffer   = '\0';
+    }
+}
+
+/*
+ * Remove trailing zeros from the fractional part of the number text in
+ * `buffer` (used for %g). If the fraction becomes empty the decimal point
+ * is removed too. An exponent suffix ("e+05"), if present, is preserved
+ * and moved down to follow the last kept digit. Text without a '.' is
+ * left unchanged.
+ */
+static void
+cropzeros(char *buffer)
+{
+    char *stop;
+
+    while (*buffer && *buffer != '.')
+        buffer++;
+    if (*buffer++)
+    {
+        /* Find the end of the fraction: the exponent marker or the NUL */
+        while (*buffer && *buffer != 'e' && *buffer != 'E')
+            buffer++;
+        stop = buffer--;
+        /* Back up over trailing zeros, and the '.' if nothing is left */
+        while (*buffer == '0')
+            buffer--;
+        if (*buffer == '.')
+            buffer--;
+        /* Slide the tail (exponent, if any, plus the NUL) down so it
+           directly follows the last character that is kept. */
+        while ((*++buffer = *stop++) != '\0')
+            ;
+    }
+}
+
+/*
+ * Format the floating point value `num` into str and return the advanced
+ * output cursor (no NUL terminator is written).
+ *
+ *   size      - minimum field width
+ *   precision - digits after the point; negative selects the default of 6
+ *   fmt       - conversion character handed to parse_float()
+ *   flags     - bitmask of ZEROPAD, SIGN, PLUS, SPACE, LEFT, HEX_PREP
+ */
+static char *
+flt(char *str, double num, int size, int precision, char fmt, int flags)
+{
+    char text[80];
+    char fill;
+    char sign = 0;
+    int  len, pad, k;
+
+    // Left align means no zero padding
+    if (flags & LEFT)
+        flags &= ~ZEROPAD;
+    fill = (flags & ZEROPAD) ? '0' : ' ';
+
+    // Pick the sign character; it takes one column of the field
+    if (flags & SIGN)
+    {
+        if (num < 0.0)
+        {
+            sign = '-';
+            num  = -num;
+        }
+        else if (flags & PLUS)
+            sign = '+';
+        else if (flags & SPACE)
+            sign = ' ';
+
+        if (sign)
+            size--;
+    }
+
+    // Default precision: 6
+    if (precision < 0)
+        precision = 6;
+
+    // Convert the (now non-negative) value to text
+    parse_float(num, text, fmt, precision);
+
+    // '#' forces a decimal point; plain %g drops trailing zeros
+    if ((flags & HEX_PREP) && precision == 0)
+        decimal_point(text);
+    if (fmt == 'g' && !(flags & HEX_PREP))
+        cropzeros(text);
+
+    len = strnlen(text, 256);
+
+    // Padding still owed; whichever loop runs first uses it all up
+    pad = size - len;
+
+    if (!(flags & (ZEROPAD | LEFT)))
+        for (; pad > 0; pad--)
+            *str++ = ' ';
+    if (sign)
+        *str++ = sign;
+    if (!(flags & LEFT))
+        for (; pad > 0; pad--)
+            *str++ = fill;
+    for (k = 0; k < len; k++)
+        *str++ = text[k];
+    for (; pad > 0; pad--)
+        *str++ = ' ';
+
+    return str;
+}
+
+#endif
+
+/*
+ * Minimal vsprintf: expand `fmt` with the arguments in `args` into `buf`
+ * and NUL-terminate it. Returns the number of characters written, not
+ * counting the terminator.
+ *
+ * Supported conversions: c, s, p, a/A (address formats via iaddr/eaddr),
+ * o, x, X, d, i, u, and f when HAS_FLOAT is set. Flags '-', '+', ' ', '#'
+ * and '0', a field width and a precision (either may be '*') and the
+ * 'l'/'L' length modifier are parsed. Anything else is copied through
+ * preceded by '%'.
+ *
+ * The function takes no buffer size and performs no bounds checking; the
+ * caller must supply a buffer large enough for the expanded text.
+ */
+static int
+ee_vsprintf(char *buf, const char *fmt, va_list args)
+{
+    int           len;
+    unsigned long num;
+    int           i, base;
+    char *        str;
+    char *        s;
+
+    int flags; // Flags to number()
+
+    int field_width; // Width of output field
+    int precision;   // Min. # of digits for integers; max number of chars for
+                     // from string
+    int qualifier;   // 'l' or 'L' length modifier, or -1 if none
+                     // ('h' is not parsed)
+
+    for (str = buf; *fmt; fmt++)
+    {
+        // Ordinary characters are copied verbatim
+        if (*fmt != '%')
+        {
+            *str++ = *fmt;
+            continue;
+        }
+
+        // Process flags
+        flags = 0;
+    repeat:
+        fmt++; // This also skips first '%'
+        switch (*fmt)
+        {
+            case '-':
+                flags |= LEFT;
+                goto repeat;
+            case '+':
+                flags |= PLUS;
+                goto repeat;
+            case ' ':
+                flags |= SPACE;
+                goto repeat;
+            case '#':
+                flags |= HEX_PREP;
+                goto repeat;
+            case '0':
+                flags |= ZEROPAD;
+                goto repeat;
+        }
+
+        // Get field width
+        field_width = -1;
+        if (is_digit(*fmt))
+            field_width = skip_atoi(&fmt);
+        else if (*fmt == '*')
+        {
+            fmt++;
+            field_width = va_arg(args, int);
+            // A negative '*' width means left-justify with the absolute width
+            if (field_width < 0)
+            {
+                field_width = -field_width;
+                flags |= LEFT;
+            }
+        }
+
+        // Get the precision
+        precision = -1;
+        if (*fmt == '.')
+        {
+            ++fmt;
+            if (is_digit(*fmt))
+                precision = skip_atoi(&fmt);
+            else if (*fmt == '*')
+            {
+                ++fmt;
+                precision = va_arg(args, int);
+            }
+            // A bare '.' or a negative '*' precision is treated as 0
+            if (precision < 0)
+                precision = 0;
+        }
+
+        // Get the conversion qualifier
+        qualifier = -1;
+        if (*fmt == 'l' || *fmt == 'L')
+        {
+            qualifier = *fmt;
+            fmt++;
+        }
+
+        // Default base
+        base = 10;
+
+        switch (*fmt)
+        {
+            case 'c':
+                if (!(flags & LEFT))
+                    while (--field_width > 0)
+                        *str++ = ' ';
+                *str++ = (unsigned char)va_arg(args, int);
+                while (--field_width > 0)
+                    *str++ = ' ';
+                continue;
+
+            case 's':
+                s = va_arg(args, char *);
+                if (!s)
+                    s = "<NULL>";
+                // precision == -1 converts to a huge count, i.e. no limit
+                len = strnlen(s, precision);
+                if (!(flags & LEFT))
+                    while (len < field_width--)
+                        *str++ = ' ';
+                for (i = 0; i < len; ++i)
+                    *str++ = *s++;
+                while (len < field_width--)
+                    *str++ = ' ';
+                continue;
+
+            case 'p':
+                // Default: zero-padded to two hex digits per pointer byte
+                if (field_width == -1)
+                {
+                    field_width = 2 * sizeof(void *);
+                    flags |= ZEROPAD;
+                }
+                str = number(str,
+                             (unsigned long)va_arg(args, void *),
+                             16,
+                             field_width,
+                             precision,
+                             flags);
+                continue;
+
+            case 'A':
+                flags |= UPPERCASE;
+                // fall through
+
+            case 'a':
+                // "%la" prints six bytes as hex pairs, "%a" four bytes as
+                // dotted decimal
+                if (qualifier == 'l')
+                    str = eaddr(str,
+                                va_arg(args, unsigned char *),
+                                field_width,
+                                precision,
+                                flags);
+                else
+                    str = iaddr(str,
+                                va_arg(args, unsigned char *),
+                                field_width,
+                                precision,
+                                flags);
+                continue;
+
+            // Integer number formats - set up the flags and "break"
+            case 'o':
+                base = 8;
+                break;
+
+            case 'X':
+                flags |= UPPERCASE;
+                // fall through
+
+            case 'x':
+                base = 16;
+                break;
+
+            case 'd':
+            case 'i':
+                flags |= SIGN;
+                // fall through
+
+            case 'u':
+                break;
+
+#if HAS_FLOAT
+
+            case 'f':
+                str = flt(str,
+                          va_arg(args, double),
+                          field_width,
+                          precision,
+                          *fmt,
+                          flags | SIGN);
+                continue;
+
+#endif
+
+            default:
+                // Unknown conversion: echo it, keeping the '%' unless the
+                // conversion itself is '%'
+                if (*fmt != '%')
+                    *str++ = '%';
+                if (*fmt)
+                    *str++ = *fmt;
+                else
+                    --fmt; // Format ended inside a conversion; let the loop see the NUL
+                continue;
+        }
+
+        // Fetch the integer argument with the width the conversion implies
+        if (qualifier == 'l')
+            num = va_arg(args, unsigned long);
+        else if (flags & SIGN)
+            num = va_arg(args, int);
+        else
+            num = va_arg(args, unsigned int);
+
+        str = number(str, num, base, field_width, precision, flags);
+    }
+
+    *str = '\0';
+    return str - buf;
+}
+
+/*
+ * Platform output hook: emit one character. This port simply forwards the
+ * character to putchar(). The direct UART access that was used previously
+ * is kept below, commented out, for reference.
+ */
+void
+uart_send_char(char c)
+{
+   putchar(c);
+
+   /*
+    Disabled alternative: write straight to the UART FIFO, sending '\r'
+    before every '\n'.
+
+    if(c=='\n')
+    {
+        while(io.uart.stat&1); // uart busy, wait...
+    io.uart.fifo = '\r';
+    }
+
+    while(io.uart.stat&1); // uart busy, wait...
+    io.uart.fifo = c;
+    */
+// #error "You must implement the method uart_send_char to use this file!\n";
+    /*  Output of a char to a UART usually follows the following model:
+            Wait until UART is ready
+            Write char to UART
+            Wait until UART is done
+
+            Or in code:
+            while (*UART_CONTROL_ADDRESS != UART_READY);
+            *UART_DATA_ADDRESS = c;
+            while (*UART_CONTROL_ADDRESS != UART_READY);
+
+            Check the UART sample code on your platform or the board
+       documentation.
+    */
+}
+
+/*
+ * printf replacement: format into a local buffer with ee_vsprintf() and
+ * push the result out one character at a time through uart_send_char().
+ * Returns the number of characters sent.
+ *
+ * NOTE(review): ee_vsprintf() is not told the buffer size, so a formatted
+ * result longer than 1023 characters would overrun buf - confirm callers
+ * stay well below that.
+ */
+int
+ee_printf(const char *fmt, ...)
+{
+    char        buf[1024];
+    const char *cursor;
+    va_list     args;
+    int         sent;
+
+    va_start(args, fmt);
+    ee_vsprintf(buf, fmt, args);
+    va_end(args);
+
+    sent = 0;
+    for (cursor = buf; *cursor != '\0'; cursor++)
+    {
+        uart_send_char(*cursor);
+        sent++;
+    }
+
+    return sent;
+}
--- a/FIRMWARE/DHRYSTONE/dhry.h
+++ b/FIRMWARE/DHRYSTONE/dhry.h
@@ -0,0 +1,425 @@
+/*
+ ****************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry.h (part 1 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *                      Siemens AG, AUT E 51
+ *                      Postfach 3220
+ *                      8520 Erlangen
+ *                      Germany (West)
+ *                              Phone:  [+49]-9131-7-20330
+ *                                      (8-17 Central European Time)
+ *                              Usenet: ..!mcsun!unido!estevax!weicker
+ *
+ *              Original Version (in Ada) published in
+ *              "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
+ *              pp. 1013 - 1030, together with the statistics
+ *              on which the distribution of statements etc. is based.
+ *
+ *              In this C version, the following C library functions are used:
+ *              - strcpy, strcmp (inside the measurement loop)
+ *              - printf, scanf (outside the measurement loop)
+ *              In addition, Berkeley UNIX system calls "times ()" or "time ()"
+ *              are used for execution time measurement. For measurements
+ *              on other systems, these calls have to be changed.
+ *
+ *  Collection of Results:
+ *              Reinhold Weicker (address see above) and
+ *
+ *              Rick Richardson
+ *              PC Research. Inc.
+ *              94 Apple Orchard Drive
+ *              Tinton Falls, NJ 07724
+ *                      Phone:  (201) 389-8963 (9-17 EST)
+ *                      Usenet: ...!uunet!pcrat!rick
+ *
+ *      Please send results to Rick Richardson and/or Reinhold Weicker.
+ *      Complete information should be given on hardware and software used.
+ *      Hardware information includes: Machine type, CPU, type and size
+ *      of caches; for microprocessors: clock frequency, memory speed
+ *      (number of wait states).
+ *      Software information includes: Compiler (and runtime library)
+ *      manufacturer and version, compilation switches, OS version.
+ *      The Operating System version may give an indication about the
+ *      compiler; Dhrystone itself performs no OS calls in the measurement loop.
+ *
+ *      The complete output generated by the program should be mailed
+ *      such that at least some checks for correctness can be made.
+ *
+ ***************************************************************************
+ *
+ *  History:    This version C/2.1 has been made for two reasons:
+ *
+ *              1) There is an obvious need for a common C version of
+ *              Dhrystone, since C is at present the most popular system
+ *              programming language for the class of processors
+ *              (microcomputers, minicomputers) where Dhrystone is used most.
+ *              There should be, as far as possible, only one C version of
+ *              Dhrystone such that results can be compared without
+ *              restrictions. In the past, the C versions distributed
+ *              by Rick Richardson (Version 1.1) and by Reinhold Weicker
+ *              had small (though not significant) differences.
+ *
+ *              2) As far as it is possible without changes to the Dhrystone
+ *              statistics, optimizing compilers should be prevented from
+ *              removing significant statements.
+ *
+ *              This C version has been developed in cooperation with
+ *              Rick Richardson (Tinton Falls, NJ), it incorporates many
+ *              ideas from the "Version 1.1" distributed previously by
+ *              him over the UNIX network Usenet.
+ *              I also thank Chaim Benedelac (National Semiconductor),
+ *              David Ditzel (SUN), Earl Killian and John Mashey (MIPS),
+ *              Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley)
+ *              for their help with comments on earlier versions of the
+ *              benchmark.
+ *
+ *  Changes:    In the initialization part, this version follows mostly
+ *              Rick Richardson's version distributed via Usenet, not the
+ *              version distributed earlier via floppy disk by Reinhold Weicker.
+ *              As a concession to older compilers, names have been made
+ *              unique within the first 8 characters.
+ *              Inside the measurement loop, this version follows the
+ *              version previously distributed by Reinhold Weicker.
+ *
+ *              At several places in the benchmark, code has been added,
+ *              but within the measurement loop only in branches that
+ *              are not executed. The intention is that optimizing compilers
+ *              should be prevented from moving code out of the measurement
+ *              loop, or from removing code altogether. Since the statements
+ *              that are executed within the measurement loop have NOT been
+ *              changed, the numbers defining the "Dhrystone distribution"
+ *              (distribution of statements, operand types and locality)
+ *              still hold. Except for sophisticated optimizing compilers,
+ *              execution times for this version should be the same as
+ *              for previous versions.
+ *
+ *              Since it has proven difficult to subtract the time for the
+ *              measurement loop overhead in a correct way, the loop check
+ *              has been made a part of the benchmark. This does have
+ *              an impact - though a very minor one - on the distribution
+ *              statistics which have been updated for this version.
+ *
+ *              All changes within the measurement loop are described
+ *              and discussed in the companion paper "Rationale for
+ *              Dhrystone version 2".
+ *
+ *              Because of the self-imposed limitation that the order and
+ *              distribution of the executed statements should not be
+ *              changed, there are still cases where optimizing compilers
+ *              may not generate code for some statements. To a certain
+ *              degree, this is unavoidable for small synthetic benchmarks.
+ *              Users of the benchmark are advised to check code listings
+ *              whether code is generated for all statements of Dhrystone.
+ *
+ *              Version 2.1 is identical to version 2.0 distributed via
+ *              the UNIX network Usenet in March 1988 except that it corrects
+ *              some minor deficiencies that were found by users of version 2.0.
+ *              The only change within the measurement loop is that a
+ *              non-executed "else" part was added to the "if" statement in
+ *              Func_3, and a non-executed "else" part removed from Proc_3.
+ *
+ ***************************************************************************
+ *
+ * Defines:     The following "Defines" are possible:
+ *              -DREG=register          (default: Not defined)
+ *                      As an approximation to what an average C programmer
+ *                      might do, the "register" storage class is applied
+ *                      (if enabled by -DREG=register)
+ *                      - for local variables, if they are used (dynamically)
+ *                        five or more times
+ *                      - for parameters if they are used (dynamically)
+ *                        six or more times
+ *                      Note that an optimal "register" strategy is
+ *                      compiler-dependent, and that "register" declarations
+ *                      do not necessarily lead to faster execution.
+ *              -DNOSTRUCTASSIGN        (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      assignment of structures.
+ *              -DNOENUMS               (default: Not defined)
+ *                      Define if the C compiler does not support
+ *                      enumeration types.
+ *              -DTIMES                 (default)
+ *              -DTIME
+ *                      The "times" function of UNIX (returning process times)
+ *                      or the "time" function (returning wallclock time)
+ *                      is used for measurement.
+ *                      For single user machines, "time ()" is adequate. For
+ *                      multi-user machines where you cannot get single-user
+ *                      access, use the "times ()" function. If you have
+ *                      neither, use a stopwatch in the dead of night.
+ *                      "printf"s are provided marking the points "Start Timer"
+ *                      and "Stop Timer". DO NOT use the UNIX "time(1)"
+ *                      command, as this will measure the total time to
+ *                      run this program, which will (erroneously) include
+ *                      the time to allocate storage (malloc) and to perform
+ *                      the initialization.
+ *              -DHZ=nnn
+ *                      In Berkeley UNIX, the function "times" returns process
+ *                      time in 1/HZ seconds, with HZ = 60 for most systems.
+ *                      CHECK YOUR SYSTEM DESCRIPTION BEFORE YOU JUST APPLY
+ *                      A VALUE.
+ *
+ ***************************************************************************
+ *
+ *  Compilation model and measurement (IMPORTANT):
+ *
+ *  This C version of Dhrystone consists of three files:
+ *  - dhry.h (this file, containing global definitions and comments)
+ *  - dhry_1.c (containing the code corresponding to Ada package Pack_1)
+ *  - dhry_2.c (containing the code corresponding to Ada package Pack_2)
+ *
+ *  The following "ground rules" apply for measurements:
+ *  - Separate compilation
+ *  - No procedure merging
+ *  - Otherwise, compiler optimizations are allowed but should be indicated
+ *  - Default results are those without register declarations
+ *  See the companion paper "Rationale for Dhrystone Version 2" for a more
+ *  detailed discussion of these ground rules.
+ *
+ *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
+ *  models ("small", "medium", "large" etc.) should be given if possible,
+ *  together with a definition of these models for the compiler system used.
+ *
+ **************************************************************************
+ *
+ *  Dhrystone (C version) statistics:
+ *
+ *  [Comment from the first distribution, updated for version 2.
+ *   Note that because of language differences, the numbers are slightly
+ *   different from the Ada version.]
+ *
+ *  The following program contains statements of a high level programming
+ *  language (here: C) in a distribution considered representative:
+ *
+ *    assignments                  52 (51.0 %)
+ *    control statements           33 (32.4 %)
+ *    procedure, function calls    17 (16.7 %)
+ *
+ *  103 statements are dynamically executed. The program is balanced with
+ *  respect to the three aspects:
+ *
+ *    - statement type
+ *    - operand type
+ *    - operand locality
+ *         operand global, local, parameter, or constant.
+ *
+ *  The combination of these three aspects is balanced only approximately.
+ *
+ *  1. Statement Type:
+ *  -----------------             number
+ *
+ *     V1 = V2                     9
+ *       (incl. V1 = F(..)
+ *     V = Constant               12
+ *     Assignment,                 7
+ *       with array element
+ *     Assignment,                 6
+ *       with record component
+ *                                --
+ *                                34       34
+ *
+ *     X = Y +|-|"&&"|"|" Z        5
+ *     X = Y +|-|"==" Constant     6
+ *     X = X +|- 1                 3
+ *     X = Y *|/ Z                 2
+ *     X = Expression,             1
+ *           two operators
+ *     X = Expression,             1
+ *           three operators
+ *                                --
+ *                                18       18
+ *
+ *     if ....                    14
+ *       with "else"      7
+ *       without "else"   7
+ *           executed        3
+ *           not executed    4
+ *     for ...                     7  |  counted every time
+ *     while ...                   4  |  the loop condition
+ *     do ... while                1  |  is evaluated
+ *     switch ...                  1
+ *     break                       1
+ *     declaration with            1
+ *       initialization
+ *                                --
+ *                                34       34
+ *
+ *     P (...)  procedure call    11
+ *       user procedure      10
+ *       library procedure    1
+ *     X = F (...)
+ *             function  call      6
+ *       user function        5
+ *       library function     1
+ *                                --
+ *                                17       17
+ *                                        ---
+ *                                        103
+ *
+ *    The average number of parameters in procedure or function calls
+ *    is 1.82 (not counting the function values as implicit parameters).
+ *
+ *
+ *  2. Operators
+ *  ------------
+ *                          number    approximate
+ *                                    percentage
+ *
+ *    Arithmetic             32          50.8
+ *
+ *       +                     21          33.3
+ *       -                      7          11.1
+ *       *                      3           4.8
+ *       / (int div)            1           1.6
+ *
+ *    Comparison             27           42.8
+ *
+ *       ==                     9           14.3
+ *       /=                     4            6.3
+ *       >                      1            1.6
+ *       <                      3            4.8
+ *       >=                     1            1.6
+ *       <=                     9           14.3
+ *
+ *    Logic                   4            6.3
+ *
+ *       && (AND-THEN)          1            1.6
+ *       |  (OR)                1            1.6
+ *       !  (NOT)               2            3.2
+ *
+ *                           --          -----
+ *                           63          100.1
+ *
+ *
+ *  3. Operand Type (counted once per operand reference):
+ *  ---------------
+ *                          number    approximate
+ *                                    percentage
+ *
+ *     Integer               175        72.3 %
+ *     Character              45        18.6 %
+ *     Pointer                12         5.0 %
+ *     String30                6         2.5 %
+ *     Array                   2         0.8 %
+ *     Record                  2         0.8 %
+ *                           ---       -------
+ *                           242       100.0 %
+ *
+ *  When there is an access path leading to the final operand (e.g. a record
+ *  component), only the final data type on the access path is counted.
+ *
+ *
+ *  4. Operand Locality:
+ *  -------------------
+ *                                number    approximate
+ *                                          percentage
+ *
+ *     local variable              114        47.1 %
+ *     global variable              22         9.1 %
+ *     parameter                    45        18.6 %
+ *        value                        23         9.5 %
+ *        reference                    22         9.1 %
+ *     function result               6         2.5 %
+ *     constant                     55        22.7 %
+ *                                 ---       -------
+ *                                 242       100.0 %
+ *
+ *
+ *  The program does not compute anything meaningful, but it is syntactically
+ *  and semantically correct. All variables have a value assigned to them
+ *  before they are used as a source operand.
+ *
+ *  There has been no explicit effort to account for the effects of a
+ *  cache, or to balance the use of long or short displacements for code or
+ *  data.
+ *
+ ***************************************************************************
+ */
+
+#pragma once
+
+/* Compiler and system dependent definitions: */
+
+#ifndef TIME
+#define TIMES
+#endif
+                /* Use times(2) time function unless    */
+                /* explicitly defined otherwise         */
+
+#ifdef TIMES
+#include <sys/types.h>
+#include <sys/times.h>
+                /* for "times" */
+#endif
+
+#define Mic_secs_Per_Second     80000000.0
+                /* Berkeley UNIX C returns process times in seconds/HZ */
+                /* NOTE(review): 80000000.0 is not the number of         */
+                /* microseconds in a second (1000000.0); presumably it   */
+                /* was repurposed as a counts-per-second figure for the  */
+                /* rdcycle()-based timing in dhry_1.c - confirm.         */
+
+/* structassign(d, s): copy structure s into d. The memcpy form is used  */
+/* when NOSTRUCTASSIGN is defined; it needs a declaration of memcpy,     */
+/* which this header does not include itself.                            */
+#ifdef  NOSTRUCTASSIGN
+#define structassign(d, s)      memcpy(&(d), &(s), sizeof(d))
+#else
+#define structassign(d, s)      d = s
+#endif
+
+/* The usage notes at the top of this file document the switch as        */
+/* -DNOENUMS, while the code historically tested NOENUM. Accept both     */
+/* spellings so that either one selects the integer fallback.            */
+#if defined(NOENUM) || defined(NOENUMS)
+#define Ident_1 0
+#define Ident_2 1
+#define Ident_3 2
+#define Ident_4 3
+#define Ident_5 4
+  typedef int   Enumeration;
+#else
+  typedef       enum    {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
+                Enumeration;
+#endif
+        /* for boolean and enumeration types in Ada, Pascal */
+
+/* General definitions: */
+
+//#include <stdio.h>
+                /* for strcpy, strcmp */
+
+#define Null 0
+                /* Value of a Null pointer */
+#define true  1
+#define false 0
+
+/* Type aliases used by the benchmark code. The range-style names        */
+/* (One_Thirty, One_Fifty) are plain int here; no range is enforced.     */
+typedef int     One_Thirty;
+typedef int     One_Fifty;
+typedef char    Capital_Letter;
+typedef int     Boolean;
+typedef char    Str_30 [31];    /* presumably 30 characters plus NUL */
+typedef int     Arr_1_Dim [50];
+typedef int     Arr_2_Dim [50] [50];
+
+/* Record type used by the benchmark: a link to another record, a        */
+/* discriminant, and a union of three alternative layouts.               */
+/* Presumably Discr tells which member of `variant` is in use - confirm  */
+/* against the code in dhry_1.c / dhry_2.c.                              */
+typedef struct record
+    {
+    struct record *Ptr_Comp;    /* pointer to another record */
+    Enumeration    Discr;
+    union {
+          struct {
+                  Enumeration Enum_Comp;
+                  int         Int_Comp;
+                  char        Str_Comp [31];
+                  } var_1;
+          struct {
+                  Enumeration E_Comp_2;
+                  char        Str_2_Comp [31];
+                  } var_2;
+          struct {
+                  char        Ch_1_Comp;
+                  char        Ch_2_Comp;
+                  } var_3;
+          } variant;
+      } Rec_Type, *Rec_Pointer;
+
+
--- a/FIRMWARE/DHRYSTONE/dhry_1.c
+++ b/FIRMWARE/DHRYSTONE/dhry_1.c
@@ -0,0 +1,384 @@
+/*
+ ****************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry_1.c (part 2 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *
+ ****************************************************************************
+ */
+
+#include "dhry.h"
+#include <stdint.h>
+
+/* Global Variables: */
+
+Rec_Pointer     Ptr_Glob,
+                Next_Ptr_Glob;
+int             Int_Glob;
+Boolean         Bool_Glob;
+char            Ch_1_Glob,
+                Ch_2_Glob;
+int             Arr_1_Glob [50];
+int             Arr_2_Glob [50] [50];
+
+Enumeration     Func_1 ();
+  /* forward declaration necessary since Enumeration may not simply be int */
+
+#ifndef REG
+        Boolean Reg = false;
+#define REG
+        /* REG becomes defined as empty */
+        /* i.e. no register variables   */
+#else
+        Boolean Reg = true;
+#endif
+
+/* variables for time measurement: */
+extern uint64_t rdcycle();
+extern uint64_t rdinstret();
+uint64_t        Begin_Time,
+                End_Time,
+                User_Time;
+uint64_t        Begin_Insn,
+                End_Insn,
+                User_Insn;
+/* end of variables for time measurement */
+
+
+main ()
+/*****/
+
+  /* main program, corresponds to procedures        */
+  /* Main and Proc_0 in the Ada version             */
+{
+        One_Fifty       Int_1_Loc;
+  REG   One_Fifty       Int_2_Loc;
+        One_Fifty       Int_3_Loc;
+  REG   char            Ch_Index;
+        Enumeration     Enum_Loc;
+        Str_30          Str_1_Loc;
+        Str_30          Str_2_Loc;
+  REG   int             Run_Index;
+  REG   int             Number_Of_Runs;
+
+  Rec_Type R1,R2;   /* local storage for the two records (replaces malloc) */
+ 
+  /* Initializations */
+
+   
+  /*
+   * FEMTOSOC/FEMTORV32 modifications ===========================
+   */ 
+   
+  /*
+   * Since there are only two calls to malloc(), and that malloc()
+   * is not supported yet by femtosoc lib, I replaced them with
+   * pre-allocated structures.
+   */ 
+  Next_Ptr_Glob = &R1; // (Rec_Pointer) malloc (sizeof (Rec_Type));
+  Ptr_Glob = &R2; // (Rec_Pointer) malloc (sizeof (Rec_Type));
+
+  /*
+   * End of FEMTOSOC/FEMTORV32 modifications ======================
+   */ 
+  Ptr_Glob->Ptr_Comp                    = Next_Ptr_Glob;
+  Ptr_Glob->Discr                       = Ident_1;
+  Ptr_Glob->variant.var_1.Enum_Comp     = Ident_3;
+  Ptr_Glob->variant.var_1.Int_Comp      = 40;
+  strcpy (Ptr_Glob->variant.var_1.Str_Comp,
+          "DHRYSTONE PROGRAM, SOME STRING");
+  strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
+
+  Arr_2_Glob [8][7] = 10;
+        /* Was missing in published program. Without this statement,    */
+        /* Arr_2_Glob [8][7] would have an undefined value.             */
+        /* Warning: With 16-Bit processors and Number_Of_Runs > 32000,  */
+        /* overflow may occur for this array element.                   */
+
+  printf ("\n");
+  printf ("Dhrystone Benchmark, Version 2.1 (Language: C)\n");
+  printf ("\n");
+  if (Reg)
+  {
+    printf ("Program compiled with 'register' attribute\n");
+    printf ("\n");
+  }
+  else
+  {
+    printf ("Program compiled without 'register' attribute\n");
+    printf ("\n");
+  }
+  printf ("Please give the number of runs through the benchmark: ");
+  {
+    // int n;
+    // scanf ("%d", &n);
+    Number_Of_Runs = 50000;   /* fixed value: the scanf() above is disabled */
+  }
+  printf ("\n");
+
+  printf ("Execution starts, %d runs through Dhrystone\n", Number_Of_Runs);
+
+  /***************/
+  /* Start timer */
+  /***************/
+  /* Both counters are sampled right before the measured loop */
+  Begin_Time = rdcycle();
+  Begin_Insn = rdinstret();
+
+  printf(">>> Begin_time=%d\n", (int)Begin_Time);   /* 64-bit value cast to int for %d */
+  printf(">>> Begin_insn=%d\n", (int)Begin_Insn);
+  
+  for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
+  {
+    Proc_5();
+    Proc_4();
+      /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
+    Int_1_Loc = 2;
+    Int_2_Loc = 3;
+    strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+    Enum_Loc = Ident_2;
+    Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
+      /* Bool_Glob == 1 */
+    while (Int_1_Loc < Int_2_Loc)  /* loop body executed once */
+    {
+      Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
+        /* Int_3_Loc == 7 */
+      Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
+        /* Int_3_Loc == 7 */
+      Int_1_Loc += 1;
+    } /* while */
+      /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+    Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
+      /* Int_Glob == 5 */
+    Proc_1 (Ptr_Glob);
+    for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
+                             /* loop body executed twice */
+    {
+      if (Enum_Loc == Func_1 (Ch_Index, 'C'))
+          /* then, not executed */
+        {
+        Proc_6 (Ident_1, &Enum_Loc);
+        strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
+        Int_2_Loc = Run_Index;
+        Int_Glob = Run_Index;
+        }
+    }
+      /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+    Int_2_Loc = Int_2_Loc * Int_1_Loc;
+    Int_1_Loc = Int_2_Loc / Int_3_Loc;
+    Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
+      /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
+    Proc_2 (&Int_1_Loc);
+      /* Int_1_Loc == 5 */
+
+  } /* loop "for Run_Index" */
+
+  /**************/
+  /* Stop timer */
+  /**************/
+
+  End_Time = rdcycle();
+  End_Insn = rdinstret();
+
+  printf ("Execution ends\n");
+  printf ("\n");
+  printf ("Final values of the variables used in the benchmark:\n");
+  printf ("\n");
+  printf ("Int_Glob:            %d\n", Int_Glob);
+  printf ("        should be:   %d\n", 5);
+  printf ("Bool_Glob:           %d\n", Bool_Glob);
+  printf ("        should be:   %d\n", 1);
+  printf ("Ch_1_Glob:           %c\n", Ch_1_Glob);
+  printf ("        should be:   %c\n", 'A');
+  printf ("Ch_2_Glob:           %c\n", Ch_2_Glob);
+  printf ("        should be:   %c\n", 'B');
+  printf ("Arr_1_Glob[8]:       %d\n", Arr_1_Glob[8]);
+  printf ("        should be:   %d\n", 7);
+  printf ("Arr_2_Glob[8][7]:    %d\n", Arr_2_Glob[8][7]);
+  printf ("        should be:   Number_Of_Runs + 10\n");
+  printf ("Ptr_Glob->\n");
+  printf ("  Ptr_Comp:          %d\n", (int) Ptr_Glob->Ptr_Comp);
+  printf ("        should be:   (implementation-dependent)\n");
+  printf ("  Discr:             %d\n", Ptr_Glob->Discr);
+  printf ("        should be:   %d\n", 0);
+  printf ("  Enum_Comp:         %d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+  printf ("        should be:   %d\n", 2);
+  printf ("  Int_Comp:          %d\n", Ptr_Glob->variant.var_1.Int_Comp);
+  printf ("        should be:   %d\n", 17);
+  printf ("  Str_Comp:          %s\n", Ptr_Glob->variant.var_1.Str_Comp);
+  printf ("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
+  printf ("Next_Ptr_Glob->\n");
+  printf ("  Ptr_Comp:          %d\n", (int) Next_Ptr_Glob->Ptr_Comp);
+  printf ("        should be:   (implementation-dependent), same as above\n");
+  printf ("  Discr:             %d\n", Next_Ptr_Glob->Discr);
+  printf ("        should be:   %d\n", 0);
+  printf ("  Enum_Comp:         %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+  printf ("        should be:   %d\n", 1);
+  printf ("  Int_Comp:          %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
+  printf ("        should be:   %d\n", 18);
+  printf ("  Str_Comp:          %s\n",
+                                Next_Ptr_Glob->variant.var_1.Str_Comp);
+  printf ("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
+  printf ("Int_1_Loc:           %d\n", Int_1_Loc);
+  printf ("        should be:   %d\n", 5);
+  printf ("Int_2_Loc:           %d\n", Int_2_Loc);
+  printf ("        should be:   %d\n", 13);
+  printf ("Int_3_Loc:           %d\n", Int_3_Loc);
+  printf ("        should be:   %d\n", 7);
+  printf ("Enum_Loc:            %d\n", Enum_Loc);
+  printf ("        should be:   %d\n", 1);
+  printf ("Str_1_Loc:           %s\n", Str_1_Loc);
+  printf ("        should be:   DHRYSTONE PROGRAM, 1'ST STRING\n");
+  printf ("Str_2_Loc:           %s\n", Str_2_Loc);
+  printf ("        should be:   DHRYSTONE PROGRAM, 2'ND STRING\n");
+  printf ("\n");
+
+  User_Time = End_Time - Begin_Time;
+  User_Insn = End_Insn - Begin_Insn;
+
+  printf("Number_Of_Runs: %d\n", Number_Of_Runs);
+  printf("User_Time: %d cycles, %d insn\n", (int)User_Time, (int)User_Insn);
+  /* Cycles per instruction as a fixed-point value scaled by 1000 */
+  uint64_t Cycles_Per_Instruction_x1000 = (1000 * User_Time) / User_Insn;
+  printf("Cycles_Per_Instruction: %d.%d%d%d\n",
+	 (int)( Cycles_Per_Instruction_x1000 / 1000),
+	 (int)((Cycles_Per_Instruction_x1000 / 100 ) % 10),
+	 (int)((Cycles_Per_Instruction_x1000 / 10  ) % 10),
+	 (int)((Cycles_Per_Instruction_x1000 / 1   ) % 10)
+  );
+
+  show_CPI_2();
+  /* Runs per 1000000 cycles, i.e. runs per second with a 1 MHz clock */
+  uint64_t Dhrystones_Per_Second_Per_MHz = ((uint64_t)Number_Of_Runs * 1000000) / User_Time;
+  printf("Dhrystones_Per_Second_Per_MHz: %d\n", (int)Dhrystones_Per_Second_Per_MHz);
+  
+   /*
+    * "Another common representation of the Dhrystone benchmark is the DMIPS (Dhrystone MIPS) obtained 
+    * when the Dhrystone score is divided by 1757 (the number of Dhrystones per second obtained on the 
+    * VAX 11/780, nominally a 1 MIPS machine)."
+    */
+   
+  int DMIPS_Per_MHz_x1000 = ((uint64_t)1000 * Dhrystones_Per_Second_Per_MHz) / 1757;
+  printf("DMIPS_Per_MHz: %d.%d%d%d\n",
+	 (int)(DMIPS_Per_MHz_x1000 / 1000),
+	 (int)((DMIPS_Per_MHz_x1000 / 100) % 10),
+	 (int)((DMIPS_Per_MHz_x1000 / 10) % 10),
+	 (int)((DMIPS_Per_MHz_x1000 / 1) % 10));
+  return 0;
+}
+
+
+Proc_1 (Ptr_Val_Par)
+/* Copies *Ptr_Glob into the record after Ptr_Val_Par, then updates that record */
+
+REG Rec_Pointer Ptr_Val_Par;
+    /* executed once */
+{
+  REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
+                                        /* == Next_Ptr_Glob */
+  /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
+  /* corresponds to "rename" in Ada, "with" in Pascal           */
+
+  structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
+  Ptr_Val_Par->variant.var_1.Int_Comp = 5;
+  Next_Record->variant.var_1.Int_Comp
+        = Ptr_Val_Par->variant.var_1.Int_Comp;
+  Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
+  Proc_3 (&Next_Record->Ptr_Comp);
+    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
+                        == Ptr_Glob->Ptr_Comp */
+  if (Next_Record->Discr == Ident_1)
+    /* then, executed */
+  {
+    Next_Record->variant.var_1.Int_Comp = 6;
+    Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
+           &Next_Record->variant.var_1.Enum_Comp);
+    Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
+    Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
+           &Next_Record->variant.var_1.Int_Comp);
+  }
+  else /* not executed */
+    structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
+} /* Proc_1 */
+
+
+Proc_2 (Int_Par_Ref)
+/* If Ch_1_Glob == 'A': *Int_Par_Ref = *Int_Par_Ref + 10 - 1 - Int_Glob */
+    /* executed once */
+    /* *Int_Par_Ref == 1, becomes 5 (main expects Int_Glob == 5 here) */
+
+One_Fifty   *Int_Par_Ref;
+{
+  One_Fifty  Int_Loc;
+  Enumeration   Enum_Loc;  /* NOTE(review): assigned only if Ch_1_Glob == 'A' */
+
+  Int_Loc = *Int_Par_Ref + 10;
+  do /* executed once */
+    if (Ch_1_Glob == 'A')
+      /* then, executed */
+    {
+      Int_Loc -= 1;
+      *Int_Par_Ref = Int_Loc - Int_Glob;
+      Enum_Loc = Ident_1;
+    } /* if */
+  while (Enum_Loc != Ident_1); /* false once Enum_Loc is set above: loop ends */
+} /* Proc_2 */
+
+
+Proc_3 (Ptr_Ref_Par)
+/* Stores Ptr_Glob->Ptr_Comp through Ptr_Ref_Par, then updates Ptr_Glob's Int_Comp */
+    /* executed once */
+    /* Ptr_Ref_Par becomes Ptr_Glob */
+
+Rec_Pointer *Ptr_Ref_Par;
+
+{
+  if (Ptr_Glob != Null)
+    /* then, executed */
+    *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
+  Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp); /* not covered by the test above */
+} /* Proc_3 */
+
+
+Proc_4 () /* without parameters */
+/* ORs (Ch_1_Glob == 'A') into Bool_Glob and sets Ch_2_Glob to 'B' */
+    /* executed once */
+{
+  Boolean Bool_Loc;
+
+  Bool_Loc = Ch_1_Glob == 'A';
+  Bool_Glob = Bool_Loc | Bool_Glob;
+  Ch_2_Glob = 'B';
+} /* Proc_4 */
+
+
+Proc_5 () /* without parameters */
+/* Resets Ch_1_Glob to 'A' and Bool_Glob to false */
+    /* executed once */
+{
+  Ch_1_Glob = 'A';
+  Bool_Glob = false;
+} /* Proc_5 */
+
+
+        /* Procedure for the assignment of structures,          */
+        /* if the C compiler doesn't support this feature       */
+#ifdef  NOSTRUCTASSIGN
+memcpy (d, s, l)
+register char   *d;     /* destination                          */
+register char   *s;     /* source                               */
+register int    l;      /* number of chars to copy              */
+{
+        while (l--) *d++ = *s++;   /* copies l chars, in increasing address order */
+}
+#endif
+
+
--- a/FIRMWARE/DHRYSTONE/dhry_2.c
+++ b/FIRMWARE/DHRYSTONE/dhry_2.c
@@ -0,0 +1,192 @@
+/*
+ ****************************************************************************
+ *
+ *                   "DHRYSTONE" Benchmark Program
+ *                   -----------------------------
+ *
+ *  Version:    C, Version 2.1
+ *
+ *  File:       dhry_2.c (part 3 of 3)
+ *
+ *  Date:       May 25, 1988
+ *
+ *  Author:     Reinhold P. Weicker
+ *
+ ****************************************************************************
+ */
+
+#include "dhry.h"
+
+#ifndef REG
+#define REG
+        /* REG becomes defined as empty */
+        /* i.e. no register variables   */
+#endif
+
+extern  int     Int_Glob;
+extern  char    Ch_1_Glob;
+
+
+Proc_6 (Enum_Val_Par, Enum_Ref_Par)
+/* Stores in *Enum_Ref_Par the value selected from Enum_Val_Par by the switch below */
+    /* executed once */
+    /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
+
+Enumeration  Enum_Val_Par;
+Enumeration *Enum_Ref_Par;
+{
+  *Enum_Ref_Par = Enum_Val_Par;
+  if (! Func_3 (Enum_Val_Par))
+    /* then, not executed */
+    *Enum_Ref_Par = Ident_4;
+  switch (Enum_Val_Par)
+  {
+    case Ident_1:
+      *Enum_Ref_Par = Ident_1;
+      break;
+    case Ident_2:
+      if (Int_Glob > 100)
+        /* then */
+      *Enum_Ref_Par = Ident_1;
+      else *Enum_Ref_Par = Ident_4;
+      break;
+    case Ident_3: /* executed */
+      *Enum_Ref_Par = Ident_2;
+      break;
+    case Ident_4: break; /* keeps the value stored before the switch */
+    case Ident_5:
+      *Enum_Ref_Par = Ident_3;
+      break;
+  } /* switch */
+} /* Proc_6 */
+
+
+Proc_7 (Int_1_Par_Val, Int_2_Par_Val, Int_Par_Ref)
+/* Computes *Int_Par_Ref = Int_1_Par_Val + 2 + Int_2_Par_Val */
+    /* executed three times                                      */
+    /* first call:      Int_1_Par_Val == 2, Int_2_Par_Val == 3,  */
+    /*                  Int_Par_Ref becomes 7                    */
+    /* second call:     Int_1_Par_Val == 10, Int_2_Par_Val == 5, */
+    /*                  Int_Par_Ref becomes 17                   */
+    /* third call:      Int_1_Par_Val == 6, Int_2_Par_Val == 10, */
+    /*                  Int_Par_Ref becomes 18                   */
+One_Fifty       Int_1_Par_Val;
+One_Fifty       Int_2_Par_Val;
+One_Fifty      *Int_Par_Ref;
+{
+  One_Fifty Int_Loc;
+
+  Int_Loc = Int_1_Par_Val + 2;
+  *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
+} /* Proc_7 */
+
+
+Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val)
+/* Writes a few elements of both arrays around index Int_1_Par_Val+5, sets Int_Glob */
+    /* executed once      */
+    /* Int_1_Par_Val == 3 */
+    /* Int_2_Par_Val == 7 */
+Arr_1_Dim       Arr_1_Par_Ref;
+Arr_2_Dim       Arr_2_Par_Ref;
+int             Int_1_Par_Val;
+int             Int_2_Par_Val;
+{
+  REG One_Fifty Int_Index;
+  REG One_Fifty Int_Loc;
+
+  Int_Loc = Int_1_Par_Val + 5;
+  Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
+  Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
+  Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
+  for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
+    Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
+  Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1; /* the element main prints as Arr_2_Glob[8][7] */
+  Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
+  Int_Glob = 5;
+} /* Proc_8 */
+
+
+Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val)
+/* Returns Ident_1 if the two characters differ, else sets Ch_1_Glob and returns Ident_2 */
+    /* executed three times                                         */
+    /* first call:      Ch_1_Par_Val == 'R', Ch_2_Par_Val == 'Y'    */
+    /* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
+    /* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */
+
+Capital_Letter   Ch_1_Par_Val;
+Capital_Letter   Ch_2_Par_Val;
+{
+  Capital_Letter        Ch_1_Loc;
+  Capital_Letter        Ch_2_Loc;
+
+  Ch_1_Loc = Ch_1_Par_Val;
+  Ch_2_Loc = Ch_1_Loc;
+  if (Ch_2_Loc != Ch_2_Par_Val)
+    /* then, executed */
+    return (Ident_1);
+  else  /* not executed */
+  {
+    Ch_1_Glob = Ch_1_Loc;
+    return (Ident_2);
+   }
+} /* Func_1 */
+
+
+Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref)
+/* Returns true if Ch_Loc == 'R' or if strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0 */
+    /* executed once */
+    /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
+    /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
+
+Str_30  Str_1_Par_Ref;
+Str_30  Str_2_Par_Ref;
+{
+  REG One_Thirty        Int_Loc;
+      Capital_Letter    Ch_Loc;  /* NOTE(review): assigned only inside the loop below */
+
+  Int_Loc = 2;
+  while (Int_Loc <= 2) /* loop body executed once */
+    if (Func_1 (Str_1_Par_Ref[Int_Loc],
+                Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
+      /* then, executed */
+    {
+      Ch_Loc = 'A';
+      Int_Loc += 1;
+    } /* if, while */
+  if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
+    /* then, not executed */
+    Int_Loc = 7;
+  if (Ch_Loc == 'R')
+    /* then, not executed */
+    return (true);
+  else /* executed */
+  {
+    if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
+      /* then, not executed */
+    {
+      Int_Loc += 7;
+      Int_Glob = Int_Loc;
+      return (true);
+    }
+    else /* executed */
+      return (false);
+  } /* if Ch_Loc */
+} /* Func_2 */
+
+
+Boolean Func_3 (Enum_Par_Val)
+/* Returns true if Enum_Par_Val is Ident_3, false otherwise */
+    /* executed once        */
+    /* Enum_Par_Val == Ident_3 */
+Enumeration Enum_Par_Val;
+{
+  Enumeration Enum_Loc;
+
+  Enum_Loc = Enum_Par_Val;
+  if (Enum_Loc == Ident_3)
+    /* then, executed */
+    return (true);
+  else /* not executed */
+    return (false);
+} /* Func_3 */
+
--- a/FIRMWARE/DHRYSTONE/stubs.c
+++ b/FIRMWARE/DHRYSTONE/stubs.c
@@ -0,0 +1,56 @@
+#include <stdint.h>
+#include <perf.h>
+
+uint64_t time() {       // thin wrapper: forwards the value of rdcycle()
+    return rdcycle();
+}
+
+uint64_t insn() {       // thin wrapper: forwards the value of rdinstret()
+    return rdinstret();
+}
+
+char *strcpy(char *dest, const char *src) {
+   char* out = dest;   // write cursor; dest itself is returned unchanged
+   do { *out = *src++; } while(*out++ != '\0');  // the '\0' is copied too
+   return dest;
+}
+
+int strcmp (const char *p1, const char *p2)  {
+   // Bytes are compared as unsigned char, so the sign of the
+   // result follows the unsigned ordering of the characters.
+   const unsigned char *a = (const unsigned char *) p1;
+   const unsigned char *b = (const unsigned char *) p2;
+   // Advance while both strings agree and p1 has not ended.
+   while (*a != '\0' && *a == *b) {
+      ++a;
+      ++b;
+   }
+   // Difference of the first bytes that differ, or 0 when
+   // both strings ended at the same position.
+   return *a - *b;
+}
+
+/*************************************************************/
+
+// Print "fixed point" number (integer/1000), with three decimals
+void printk(uint64_t kx) {
+    int whole = (int)(kx / 1000);
+    int milli = (int)(kx % 1000);
+    printf("%d.",whole);
+    // left-pad the fraction: one '0' if < 100, one more if < 10
+    for(int bound = 100; bound >= 10; bound /= 10) {
+	if(milli < bound) {
+	    printf("0");
+	}
+    }
+    printf("%d",milli);
+}
+
+void show_CPI_2() {                        // prints counters and their ratio
+   uint64_t instret = rdinstret();         // sampled first
+   uint64_t cycles  = rdcycle();           // sampled second
+   uint64_t kCPI    = cycles*1000/instret; // ratio scaled by 1000, for printk()
+   printf(">>> CPI ="); printk(kCPI); printf("\n");
+   printf(">>> instret = %d\n", (int)(instret));  // cast to int for %d
+   printf(">>> cycles  = %d\n", (int)(cycles));   
+}
--- a/FIRMWARE/GL_tty.h
+++ b/FIRMWARE/GL_tty.h
@@ -0,0 +1,460 @@
+/**
+ * ansi_graphics.h
+ * A couple of functions to display graphics in the terminal, 
+ * using ansi sequences.
+ * Bruno Levy, Jan 2024
+ */
+
+#include <stdio.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifndef GL_FPS
+#define GL_FPS 30
+#endif
+
+#if defined(__linux__) || defined(_WIN32) || defined(__APPLE__)
+#define BIGCPU  // we are compiling for a real machine
+#else
+#define TINYCPU // we are compiling for a softcore
+#endif
+
+#ifdef __linux__
+#include <unistd.h> // for usleep()
+#endif
+
+// You can define GL_width and GL_height before
+// #including ansi_graphics.h in case the plain
+// old 80x25 pixels does not suffice.
+
+#ifndef GL_width
+#define GL_width  80
+#endif
+
+#ifndef GL_height
+#define GL_height 25
+#endif
+
+/**
+ * \brief Sets the current graphics position (moves the terminal cursor)
+ * \param[in] x typically in 0,79
+ * \param[in] y typically in 0,24
+ */
+static inline void GL_gotoxy(int x, int y) {
+    printf("\033[%d;%dH",y,x); // sequence ESC[<y>;<x>H: y is sent before x
+}
+
+/**
+ * \brief Draws one pixel at the current graphics position
+ * \param[in] R , G , B the RGB color of the pixel, in [0..255]
+ * \details Typically used by programs that draw all pixels sequentially,
+ *  like a raytracer. After each line, one can either printf("\n") or
+ *  call GL_gotoxy(). If you want to draw individual pixels in an
+ *  arbitrary order, use GL_setpixelRGB(x,y,R,G,B)
+ */
+static inline void GL_setpixelRGBhere(uint8_t R, uint8_t G, uint8_t B) {
+    // set background color, print space 
+    printf("\033[48;2;%d;%d;%dm ",(int)R,(int)G,(int)B); 
+}
+
+
+/**
+ * \brief Draws two vertically stacked "pixels" in the character cell
+ *  at the current cursor position and advances the cursor.
+ * \param[in] r1 , g1 , b1 color of the first pixel (cell background)
+ * \param[in] r2 , g2 , b2 color of the second pixel (glyph foreground)
+ * \details Characters are roughly twice as high as wide, so one cell
+ *  holds two pixels: background color plus a lower-block glyph.
+ * \see https://www.w3.org/TR/xml-entity-names/025.html
+ */
+static inline void GL_set2pixelsRGBhere(
+    uint8_t r1, uint8_t g1, uint8_t b1,
+    uint8_t r2, uint8_t g2, uint8_t b2
+) {
+    int same_color = (r2 == r1) && (g2 == g1) && (b2 == b1);
+    if(same_color) {
+	// one background-colored space is enough
+	GL_setpixelRGBhere(r1,g1,b1);
+	return;
+    }
+    // background = first pixel, foreground = second pixel
+    printf("\033[48;2;%d;%d;%dm",(int)r1,(int)g1,(int)b1);
+    printf("\033[38;2;%d;%d;%dm",(int)r2,(int)g2,(int)b2);
+    printf("\xE2\x96\x83");
+}
+
+#define GL_RGB(R,G,B) #R ";" #G ";" #B 
+
+static inline void GL_setpixelIhere(
+    const char** cmap, int c
+) {
+    // background := string cmap[c] (inserted as-is in the sequence), print space
+    printf("\033[48;2;%sm ",cmap[c]); 
+}
+
+static inline void GL_set2pixelsIhere(
+    const char** cmap, int c1, int c2
+) {
+    // Colormap variant of GL_set2pixelsRGBhere(): cmap[c1] gives the
+    // background color string and cmap[c2] the foreground one.
+    if(c1 != c2) {
+	printf("\033[48;2;%sm",cmap[c1]);
+	printf("\033[38;2;%sm",cmap[c2]);
+	// lower-block glyph (https://www.w3.org/TR/xml-entity-names/025.html)
+	printf("\xE2\x96\x83");
+    } else {
+	GL_setpixelIhere(cmap, c1);
+    }
+}
+
+/**
+ * \brief Moves the cursor position to the next line.
+ * \details Background and foreground colors are set to black.
+ */
+static inline void GL_newline() {
+    printf("\033[38;2;0;0;0m");	   // foreground := RGB 0,0,0
+    printf("\033[48;2;0;0;0m\n");  // background := RGB 0,0,0, then '\n'
+}
+
+/**
+ * \brief Sets the color of the pixel at position (x,y)
+ * \param[in] x typically in 0,79
+ * \param[in] y typically in 0,24
+ * \param[in] R , G , B the RGB color of the pixel, in [0..255]
+ */
+static inline void GL_setpixelRGB(
+    int x, int y, uint8_t R, uint8_t G, uint8_t B
+) {
+    GL_gotoxy(x,y);            // move the cursor to the pixel...
+    GL_setpixelRGBhere(R,G,B); // ...and paint it there
+}
+
+/**
+ * \brief Restores the default colors: white foreground, black background
+ */
+static inline void GL_restore_default_colors() {
+    printf(
+        "\033[48;5;16m"   // set background color black
+        "\033[38;5;15m"   // set foreground color white
+    );
+}
+
+/**
+ * \brief Call this function each time graphics should be cleared
+ */
+static inline void GL_clear() {
+    GL_restore_default_colors(); // default colors first, then erase
+    printf("\033[2J"); // clear screen
+}
+
+/**
+ * \brief Moves current drawing position to top-left corner
+ * \see GL_setpixelRGBhere() and GL_set2pixelsRGBhere()
+ */
+static inline void GL_home() {
+    printf("\033[H"); // same sequence as in GL_gotoxy(), without coordinates
+}
+
+/**
+ * \brief Call this function before starting drawing graphics 
+ *  or each time graphics should be cleared
+ */
+static inline void GL_init() {
+    printf("\033[?25l"); // hide cursor
+    GL_home();           // cursor to the top-left corner
+    GL_clear();          // default colors, then clear screen
+}
+
+
+/**
+ * \brief Call this function at the end of the program
+ */
+static inline void GL_terminate() {
+    GL_restore_default_colors();
+    GL_gotoxy(0,GL_height); // leave the cursor at row GL_height
+    printf("\033[?25h"); // show cursor
+}
+
+/**
+ * \brief Flushes pending graphic operations and waits a bit
+ */
+static inline void GL_swapbuffers() {
+    // only flush if we are on a big machine, with true stdio support
+    // otherwise does nothing (because our small MCU io lib is not buffered)
+#ifdef BIGCPU    
+   fflush(stdout);
+#endif
+#ifdef __linux__   
+   usleep(1000000/GL_FPS); // 1/GL_FPS second, in microseconds (Linux only)
+#endif
+}
+
+typedef void (*GL_pixelfunc_RGB)(int x, int y, uint8_t* r, uint8_t* g, uint8_t* b);
+typedef void (*GL_pixelfunc_RGBf)(int x, int y, float* r, float* g, float* b);
+
+/**
+ * \brief Renders an image by querying a user callback for each pixel.
+ * \param[in] width , height image size, in square pixels
+ * \param[in] do_pixel callback ("shader") that writes the integer
+ *  r,g,b components of pixel (x,y) through its pointer arguments
+ * \details Two image rows share one text row (half-character pixels),
+ *  so row y+1 is also queried when height is odd.
+ */
+static inline void GL_scan_RGB(
+    int width, int height, GL_pixelfunc_RGB do_pixel
+) {
+    uint8_t top[3];
+    uint8_t bot[3];
+    GL_home();
+    for (int y = 0; y < height; y += 2) {
+	for (int x = 0; x < width; ++x) {
+	    do_pixel(x, y,     &top[0], &top[1], &top[2]);
+	    do_pixel(x, y + 1, &bot[0], &bot[1], &bot[2]);
+	    GL_set2pixelsRGBhere(top[0],top[1],top[2],bot[0],bot[1],bot[2]);
+	}
+	if(width > 0) {
+	    GL_newline(); // end of text row (only if something was drawn)
+	}
+    }
+}
+
+/**
+ * \brief Converts a floating point value to a byte.
+ * \param[in] f the floating point value, nominally in [0,1]
+ * \return the byte, in [0,255]
+ * \details the input value is clamped to [0,1] before scaling by 255
+ */ 
+static inline uint8_t GL_ftoi(float f) {
+    if(f < 0.0f) { f = 0.0f; }
+    if(f > 1.0f) { f = 1.0f; }
+    return (uint8_t)(f * 255.0f);
+}
+
+/**
+ * \brief Renders an image by querying a user callback for each pixel.
+ * \param[in] width , height image size, in square pixels
+ * \param[in] do_pixel callback ("shader") that writes the floating-point
+ *  components fr,fg,fb of pixel (x,y) through its pointer arguments
+ * \details Two image rows share one text row (half-character pixels);
+ *  each component is converted to a byte with GL_ftoi().
+ */
+static inline void GL_scan_RGBf(
+    int width, int height, GL_pixelfunc_RGBf do_pixel
+) {
+    float   ftop[3]; // callback output for row y
+    float   fbot[3]; // callback output for row y+1
+    uint8_t top[3];
+    uint8_t bot[3];
+    GL_home();
+    for (int y = 0; y < height; y += 2) {
+	for (int x = 0; x < width; ++x) {
+	    do_pixel(x, y, &ftop[0], &ftop[1], &ftop[2]);
+	    for (int c = 0; c < 3; ++c) {
+		top[c] = GL_ftoi(ftop[c]);
+	    }
+	    do_pixel(x, y + 1, &fbot[0], &fbot[1], &fbot[2]);
+	    for (int c = 0; c < 3; ++c) {
+		bot[c] = GL_ftoi(fbot[c]);
+	    }
+	    GL_set2pixelsRGBhere(top[0],top[1],top[2],bot[0],bot[1],bot[2]);
+	}
+	if(width > 0) {
+	    GL_newline(); // end of text row (only if something was drawn)
+	}
+    }
+}
+
+/***************************************************************/
+
+#define INSIDE 0
+#define LEFT   1
+#define RIGHT  2
+#define BOTTOM 4
+#define TOP    8
+// Clipping window, in pixel coordinates
+#define XMIN 0
+#define XMAX (GL_width-1)
+#define YMIN 0
+#define YMAX (GL_height-1)
+// Outcode of (x,y): OR of LEFT, RIGHT, BOTTOM, TOP (INSIDE if none applies)
+#define code(x,y) \
+    (((x) < XMIN) | (((x) > XMAX)<<1) | (((y) < YMIN)<<2) | (((y) > YMAX)<<3))
+
+/***************************************************************/
+
+static inline void GL_line(
+    int x1, int y1, int x2, int y2, int R, int G, int B
+) {
+    int x,y,dx,dy,sx,sy,tmp; // note: sx is declared but never used
+    // Draws the part of segment (x1,y1)-(x2,y2) inside the clipping window.
+    /* Cohen-Sutherland line clipping. */
+    int code1 = code(x1,y1);
+    int code2 = code(x2,y2);
+    int codeout;
+
+    for(;;) {
+	/* Both points inside. */
+	if(code1 == 0 && code2 == 0) {
+	    break;
+	}
+
+	/* Both points beyond the same border: nothing to draw. */
+	if(code1 & code2) {
+	    return;
+	}
+
+	/* One of the points is outside. */
+	codeout = code1 ? code1 : code2;
+
+	/* Compute intersection. */
+	if (codeout & TOP) { 
+	    x = x1 + (x2 - x1) * (YMAX - y1) / (y2 - y1); 
+	    y = YMAX; 
+	} else if (codeout & BOTTOM) { 
+	    x = x1 + (x2 - x1) * (YMIN - y1) / (y2 - y1); 
+	    y = YMIN; 
+	}  else if (codeout & RIGHT) { 
+	    y = y1 + (y2 - y1) * (XMAX - x1) / (x2 - x1); 
+	    x = XMAX; 
+	} else if (codeout & LEFT) { 
+	    y = y1 + (y2 - y1) * (XMIN - x1) / (x2 - x1); 
+	    x = XMIN; 
+	} 
+	
+	/* Replace outside point with intersection. */
+	if (codeout == code1) { 
+	    x1 = x; 
+	    y1 = y;
+	    code1 = code(x1,y1);
+	} else { 
+	    x2 = x; 
+	    y2 = y;
+	    code2 = code(x2,y2);
+	}
+    }
+    
+    // Swap both extremities to ensure x increases
+    if(x2 < x1) {
+       tmp = x2;
+       x2 = x1;
+       x1 = tmp;
+       tmp = y2;
+       y2 = y1;
+       y1 = tmp;
+    }
+   
+    // Bresenham line drawing.
+    dy = y2 - y1;
+    sy = 1;
+    if(dy < 0) {
+	sy = -1;
+	dy = -dy;
+    }
+    // dx >= 0 thanks to the swap above; dy >= 0 with its sign kept in sy
+    dx = x2 - x1;
+   
+    x = x1;
+    y = y1;
+    
+    if(dy > dx) { // steep segment: one step in y per iteration
+	int ex = (dx << 1) - dy;
+	for(int u=0; u<dy; u++) {
+	    GL_setpixelRGB(x,y,R,G,B);
+	    y += sy;
+	    if(ex >= 0)  {
+		x++;
+		ex -= dy << 1;
+		GL_setpixelRGB(x,y,R,G,B);
+	    }
+	    while(ex >= 0)  {
+		x++;
+		ex -= dy << 1;
+	        putchar(' ');
+	    }
+	    ex += dx << 1;
+	}
+    } else { // dy <= dx: one step in x per iteration
+	int ey = (dy << 1) - dx;
+	for(int u=0; u<dx; u++) {
+	    GL_setpixelRGB(x,y,R,G,B);
+	    x++;
+	    while(ey >= 0) {
+		y += sy;
+		ey -= dx << 1;
+		GL_setpixelRGB(x,y,R,G,B);
+	    }
+	    ey += dy << 1;
+	}
+    }
+}
+
+
+/***************************************************************/
+
+#ifdef GL_USE_TURTLE
+
+#include "sintab.h" // Ugly !!!
+
+typedef struct {
+    int x;        // in [0..79]
+    int y;        // in [0..24]
+    int angle;    // in degrees (Turtle_init() sets it to -90)
+    int R,G,B;    // pen color, passed to GL_line()
+    int pendown;  // draw if non-zero
+} Turtle;
+    
+static inline void Turtle_init(Turtle* T) {
+    // Start in the middle of the GL_width x GL_height area...
+    T->x = GL_width/2;
+    T->y = GL_height/2;
+    // ...with an angle of -90 degrees and the pen down...
+    T->angle = -90;
+    T->pendown = 1;
+    T->R = T->G = T->B = 255; // ...and a white pen.
+}
+
+static inline void Turtle_pen_up(Turtle* T) { // Turtle_forward() stops drawing
+    T->pendown = 0;
+}
+
+static inline void Turtle_pen_down(Turtle* T) { // Turtle_forward() draws again
+    T->pendown = 1;
+}
+
+static inline void Turtle_pen_color(Turtle* T, int R, int G, int B) {
+    T->R = R; // color used by the next Turtle_forward() calls
+    T->G = G;
+    T->B = B;
+}
+
+static inline void Turtle_forward(Turtle* T, int distance) {
+    // Moves the turtle by distance along T->angle; draws if the pen is down.
+    int last_x = T->x;
+    int last_y = T->y;
+    // Table index in [0,359]: 360 wraps to 0, so the lookup stays in
+    // range even if the tables have only 360 entries (TODO confirm size).
+    int a = T->angle;
+    while(a < 0)    { a += 360; }
+    while(a >= 360) { a -= 360; }
+    // table values are divided by 256 after scaling by distance
+    T->x += (costab[a] * distance) / 256;
+    T->y += (sintab[a] * distance) / 256;
+    if(T->pendown) {
+        GL_line(last_x, last_y, T->x, T->y, T->R, T->G, T->B);
+    }
+}
+
+static inline void Turtle_backward(Turtle* T, int distance) { // forward, negated
+    Turtle_forward(T,-distance);
+}
+
+static inline void Turtle_turn_right(Turtle* T, int delta_angle) { // degrees
+    T->angle += delta_angle; // not normalized here; Turtle_forward() does it
+}
+
+static inline void Turtle_turn_left(Turtle* T, int delta_angle) { // degrees
+    Turtle_turn_right(T, -delta_angle);
+}
+
+#endif
--- a/FIRMWARE/Makefile
+++ b/FIRMWARE/Makefile
@@ -0,0 +1,80 @@
+# Toolchain variables used below (RVLD, RVOBJCOPY, ARCH, ABI, ...) are not set
+# in this file; presumably they come from this include.
+include ../../../FIRMWARE/makefile.inc
+RVASFLAGS=-march=$(ARCH) -mabi=$(ABI) 
+RVCFLAGS=-I. -O2 -fno-pic -march=$(ARCH) -mabi=$(ABI) -fno-stack-protector -w -Wl,--no-relax
+
+# RAM size in bytes given to firmware_words (6144 = 0x1800, the BRAM length in bram.ld).
+RAM_SIZE=6144
+
+# Support objects linked into every firmware image.
+LIBOBJECTS=putchar.o wait.o print.o memcpy.o errno.o perf.o
+
+# BRAM image. start.o is a prerequisite but is not on the link line:
+# bram.ld names it explicitly in its SECTIONS command.
+%.bram.elf: %.o start.o $(LIBOBJECTS) $(RV_BINARIES)
+	$(RVLD) -T bram.ld -m elf32lriscv -nostdlib -norelax $< $(LIBOBJECTS) $(RVTOOLCHAIN_GCC_LIB_DIR)/libgcc.a -o $@
+
+
+# Converts an ELF into a .hex file with firmware_words, copies it to
+# ../firmware.hex and ../obj_dir/firmware.hex, and records its name in ../firmware.txt.
+%.hex: %.elf $(FIRMWARE_DIR)/TOOLS/firmware_words 
+	$(FIRMWARE_DIR)/TOOLS/firmware_words $< -ram $(RAM_SIZE) -max_addr $(RAM_SIZE) -out $@
+	cp $@ ../firmware.hex
+	mkdir -p ../obj_dir
+	cp $@ ../obj_dir/firmware.hex
+	echo $@ > ../firmware.txt
+
+
+# SPI FLASH 0 (sends everything to SPI flash)
+
+%.spiflash0.elf: %.o start.o $(LIBOBJECTS) $(RV_BINARIES)
+	$(RVLD) -T spiflash0.ld -m elf32lriscv -nostdlib -norelax $< $(LIBOBJECTS) $(RVTOOLCHAIN_GCC_LIB_DIR)/libgcc.a -o $@
+
+# Raw binary image of the ELF, as written to the flash.
+%.spiflash0.bin: %.spiflash0.elf
+	$(RVOBJCOPY) $< $@ -O binary
+
+# Programs the image with iceprog, at a 128k offset.
+%.spiflash0.prog: %.spiflash0.bin
+	iceprog -o 128k $<
+
+# SPI FLASH 1 (sends code and variables initialization to SPI flash, variables to RAM)
+
+%.spiflash1.elf: %.o start_spiflash1.o $(LIBOBJECTS) $(RV_BINARIES)
+	$(RVLD) -T spiflash1.ld -m elf32lriscv -nostdlib -norelax $< $(LIBOBJECTS) $(RVTOOLCHAIN_GCC_LIB_DIR)/libgcc.a -o $@
+
+
+%.spiflash1.bin: %.spiflash1.elf
+	$(RVOBJCOPY) $< $@ -O binary
+
+%.spiflash1.prog: %.spiflash1.bin
+	iceprog -o 128k $<
+
+
+# SPI FLASH 2 (sends code and variables initialization to SPI flash, variables and fastcode to RAM)
+# Unlike the two variants above, this one also links against libm (-lm).
+
+%.spiflash2.elf: %.o start_spiflash1.o $(LIBOBJECTS) $(RV_BINARIES)
+	$(RVLD) -T spiflash2.ld -m elf32lriscv -nostdlib -norelax $< $(LIBOBJECTS) -L$(RVTOOLCHAIN_LIB_DIR) -lm  $(RVTOOLCHAIN_GCC_LIB_DIR)/libgcc.a -o $@
+
+
+%.spiflash2.bin: %.spiflash2.elf
+	$(RVOBJCOPY) $< $@ -O binary
+
+%.spiflash2.prog: %.spiflash2.bin
+	iceprog -o 128k $<
+
+# Full disassembly listing (numeric register names) of the ELF.
+%.spiflash2.list: %.spiflash2.elf
+	$(RVOBJDUMP) -Mnumeric -D $< > $@
+
+# DUAL MEMORY (64 kb program ROM, 64 kb data RAM)
+
+# Links with pipeline.ld (and libm), then writes a disassembly listing next to the ELF.
+%.pipeline.elf: %.o start_pipeline.o $(LIBOBJECTS) $(RV_BINARIES)
+	$(RVLD) -T pipeline.ld -m elf32lriscv -nostdlib -norelax $< $(LIBOBJECTS) -L$(RVTOOLCHAIN_LIB_DIR) -lm $(RVTOOLCHAIN_GCC_LIB_DIR)/libgcc.a  -o $@
+	$(RVOBJDUMP) -Mnumeric -D $@ > $@.list
+
+# Program ROM image: addresses 0 to 0xFFFF of the ELF.
+%.PROGROM.hex: %.pipeline.elf $(FIRMWARE_DIR)/TOOLS/firmware_words 
+	$(FIRMWARE_DIR)/TOOLS/firmware_words $< -ram 0x20000 -max_addr 0x20000 -out $@ -from_addr 0 -to_addr 0xFFFF
+	cp $@ ../PROGROM.hex
+	mkdir -p ../obj_dir
+	cp $@ ../obj_dir/PROGROM.hex
+
+# Data RAM image: addresses 0x10000 to 0x1FFFF of the ELF.
+%.DATARAM.hex: %.pipeline.elf $(FIRMWARE_DIR)/TOOLS/firmware_words 
+	$(FIRMWARE_DIR)/TOOLS/firmware_words $< -ram 0x20000 -max_addr 0x20000 -out $@ -from_addr 0x10000 -to_addr 0x1FFFF
+	cp $@ ../DATARAM.hex
+	mkdir -p ../obj_dir
+	cp $@ ../obj_dir/DATARAM.hex
+
+# Builds both images and records the target name in ../firmware.txt.
+%.pipeline.hex: %.PROGROM.hex %.DATARAM.hex
+	echo $@ > ../firmware.txt
+
--- a/FIRMWARE/PRECOMPILED/RV32I/COREMARK/DATARAM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32I/COREMARK/DATARAM.hex
--- a/FIRMWARE/PRECOMPILED/RV32I/COREMARK/PROGROM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32I/COREMARK/PROGROM.hex
--- a/FIRMWARE/PRECOMPILED/RV32I/DHRYSTONES/DATARAM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32I/DHRYSTONES/DATARAM.hex
--- a/FIRMWARE/PRECOMPILED/RV32I/DHRYSTONES/PROGROM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32I/DHRYSTONES/PROGROM.hex
--- a/FIRMWARE/PRECOMPILED/RV32I/RAYSTONES/DATARAM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32I/RAYSTONES/DATARAM.hex
--- a/FIRMWARE/PRECOMPILED/RV32I/RAYSTONES/PROGROM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32I/RAYSTONES/PROGROM.hex
--- a/FIRMWARE/PRECOMPILED/RV32IM/COREMARK/DATARAM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32IM/COREMARK/DATARAM.hex
--- a/FIRMWARE/PRECOMPILED/RV32IM/COREMARK/PROGROM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32IM/COREMARK/PROGROM.hex
--- a/FIRMWARE/PRECOMPILED/RV32IM/DHRYSTONES/DATARAM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32IM/DHRYSTONES/DATARAM.hex
--- a/FIRMWARE/PRECOMPILED/RV32IM/DHRYSTONES/PROGROM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32IM/DHRYSTONES/PROGROM.hex
--- a/FIRMWARE/PRECOMPILED/RV32IM/RAYSTONES/DATARAM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32IM/RAYSTONES/DATARAM.hex
--- a/FIRMWARE/PRECOMPILED/RV32IM/RAYSTONES/PROGROM.hex
+++ b/FIRMWARE/PRECOMPILED/RV32IM/RAYSTONES/PROGROM.hex
--- a/FIRMWARE/ST_NICCC.c
+++ b/FIRMWARE/ST_NICCC.c
@@ -0,0 +1,480 @@
+/*
+ * Reading the ST-NICCC megademo data stored in
+ * the SPI flash and streaming it to polygons,
+ * rendered as ANSI character sequences through
+ * the UART.
+ * 
+ * The polygon stream is a 640K file, that needs
+ * to be stored in the SPI flash, using:
+ * ICEStick: iceprog -o 1M EXAMPLES/DATA/scene1.dat
+ * ULX3S:    cp EXAMPLES/DATA/scene1.dat scene1.img
+ *           ujprog -j flash -f 1048576 scene1.img
+ *   (using latest version of ujprog compiled from https://github.com/kost/fujprog)
+ *
+ * More details and links in EXAMPLES/DATA/notes.txt
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __linux__
+#include <stdlib.h>
+#include <unistd.h>
+#else
+#include "io.h"
+#endif
+
+// when compiling for SPI flash, uncomment to fit some routines in fast BRAM
+// (but it does not change much, the bottleneck is ANSI RGB encoding and uart).
+//#define RV32_FASTCODE __attribute((section(".fastcode")))
+#define RV32_FASTCODE
+
+// when compiling for SPI flash, uncomment to enable wireframe mode (but it is ugly
+// and it will not fit in BRAM !)
+// #define WITH_WIREFRAME 
+
+#ifdef WITH_WIREFRAME
+int wireframe = 0;
+#endif
+
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+#define MAX(x,y) ((x) > (y) ? (x) : (y))
+
+
+/**********************************************************************************/
+/* Graphics routines                                                              */
+/**********************************************************************************/
+
+
+// Map coordinates from file to screen
+
+/* Maps a horizontal coordinate read from the file to a screen column (halves it). */
+static inline uint8_t map_x(uint8_t x) {
+    return (uint8_t)(x / 2);
+}
+
+/* Maps a vertical coordinate read from the file to a screen row (divides it by 4). */
+static inline uint8_t map_y(uint8_t y) {
+    return (uint8_t)(y / 4);
+}
+
+/* Clears the terminal to black, by sending two ANSI escape sequences. */
+void GL_clear() {
+    printf("\033[48;5;16m"   // set background color black
+           "\033[2J");       // clear screen
+}
+
+/* 
+ * Set background color using 6x6x6 colorcube codes
+ * see https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences
+ */
+static inline void GL_setcolor(int color) {
+   // Last color code sent to the terminal (-1: none sent yet).
+   static int current = -1;
+   if(color == current) {
+      return; // already active, do not resend the escape sequence
+   }
+   printf("\033[48;5;%dm",color);
+   current = color;
+}
+
+/*
+ * Paints one character cell: moves the cursor to row y, column x and
+ * prints a space (which shows the current background color).
+ */
+static inline void GL_setpixel(int x, int y) {
+   printf("\033[%d;%dH ",y,x); // Goto_XY(x1,y) and print space
+}
+
+#ifdef WITH_WIREFRAME
+/*
+ * Draws a line from (x1,y1) to (x2,y2) with GL_setpixel(), using an
+ * integer error-accumulation (Bresenham-style) loop.
+ * The extremities are swapped if needed so that x always increases;
+ * sy gives the direction of the y steps.
+ */
+void GL_line(int x1, int y1, int x2, int y2) RV32_FASTCODE;
+void GL_line(int x1, int y1, int x2, int y2) {
+    int x,y,dx,dy,sy,tmp;
+
+    // Swap both extremities to ensure x increases
+    if(x2 < x1) {
+       tmp = x2;
+       x2 = x1;
+       x1 = tmp;
+       tmp = y2;
+       y2 = y1;
+       y1 = tmp;
+    }
+   
+    // Bresenham line drawing.
+    dy = y2 - y1;
+    sy = 1;
+    if(dy < 0) {
+	sy = -1;
+	dy = -dy;
+    }
+
+    dx = x2 - x1;
+   
+    x = x1;
+    y = y1;
+    
+    if(dy > dx) {
+	// Steep line: one iteration per row, ex decides when x advances.
+	int ex = (dx << 1) - dy;
+	for(int u=0; u<dy; u++) {
+	    GL_setpixel(x,y);
+	    y += sy;
+	    if(ex >= 0)  {
+		x++;
+		ex -= dy << 1;
+		GL_setpixel(x,y);
+	    }
+	    // Any further x step on the same row prints a space at the
+	    // current cursor position, without repositioning the cursor.
+	    while(ex >= 0)  {
+		x++;
+		ex -= dy << 1;
+	        putchar(' ');
+	    }
+	    ex += dx << 1;
+	}
+    } else {
+	// Shallow line: one iteration per column, ey decides when y advances.
+	int ey = (dy << 1) - dx;
+	for(int u=0; u<dx; u++) {
+	    GL_setpixel(x,y);
+	    x++;
+	    while(ey >= 0) {
+		y += sy;
+		ey -= dx << 1;
+		GL_setpixel(x,y);
+	    }
+	    ey += dy << 1;
+	}
+    }
+}
+#endif
+
+/*
+ * Fills a polygon with spaces, which show the current background color.
+ *   nb_pts: number of vertices
+ *   points: vertex coordinates, xi = points[2*i], yi = points[2*i+1]
+ * A single [x_left,x_right) span is drawn per scanline, so the polygon is
+ * expected to be convex. The y coordinates index the 128-entry span tables
+ * and the x coordinates are stored as char, so both must fit in [0..127].
+ * When WITH_WIREFRAME is defined and `wireframe` is set, part of the edges
+ * is drawn with GL_line() instead and nothing is filled.
+ */
+void GL_fillpoly(int nb_pts, int* points) RV32_FASTCODE;
+void GL_fillpoly(int nb_pts, int* points) {
+    char x_left[128];
+    char x_right[128];
+
+    /* Determine clockwise, miny, maxy */
+    int clockwise = 0;
+    int miny =  256;
+    int maxy = -256;
+
+    for(int i1=0; i1<nb_pts; ++i1) {
+        int i2=(i1==nb_pts-1) ? 0 : i1+1;
+        int i3=(i2==nb_pts-1) ? 0 : i2+1;
+        int x1 = points[2*i1];
+        int y1 = points[2*i1+1];
+        int dx1 = points[2*i2]   - x1;
+        int dy1 = points[2*i2+1] - y1;
+        int dx2 = points[2*i3]   - x1;
+        int dy2 = points[2*i3+1] - y1;
+        /* Sum of cross products: its sign gives the orientation */
+        clockwise += dx1 * dy2 - dx2 * dy1;
+        miny = MIN(miny,y1);
+        maxy = MAX(maxy,y1);
+    }
+
+    /* Determine x_left and x_right for each scanline */
+    for(int i1=0; i1<nb_pts; ++i1) {
+        int i2=(i1==nb_pts-1) ? 0 : i1+1;
+
+        int x1 = points[2*i1];
+        int y1 = points[2*i1+1];
+        int x2 = points[2*i2];
+        int y2 = points[2*i2+1];
+
+#ifdef WITH_WIREFRAME
+        if(wireframe) {
+            if((clockwise > 0) ^ (y2 > y1)) {
+                GL_line(x1,y1,x2,y2);
+            }
+            continue;
+        }
+#endif
+
+        /* Orientation and edge direction select which side this edge bounds */
+        char* x_buffer = ((clockwise > 0) ^ (y2 > y1)) ? x_left : x_right;
+        int dx = x2 - x1;
+        int sx = 1;
+        int dy = y2 - y1;
+        int sy = 1;
+        int x = x1;
+        int y = y1;
+        int ex;
+
+        if(dx < 0) {
+            sx = -1;
+            dx = -dx;
+        }
+
+        if(dy < 0) {
+            sy = -1;
+            dy = -dy;
+        }
+
+        /* Horizontal edge: it bounds both sides of its scanline */
+        if(y1 == y2) {
+            x_left[y1]  = MIN(x1,x2);
+            x_right[y1] = MAX(x1,x2);
+            continue;
+        }
+
+        ex = (dx << 1) - dy;
+
+        /* Walk the edge, one scanline per iteration (both ends included) */
+        for(int u=0; u <= dy; ++u) {
+            x_buffer[y] = x;
+            y += sy;
+            while(ex >= 0) {
+                x += sx;
+                ex -= dy << 1;
+            }
+            ex += dx << 1;
+        }
+    }
+
+#ifdef WITH_WIREFRAME
+    if(!wireframe)
+#endif
+    {
+        for(int y = miny; y <= maxy; ++y) {
+            int x1 = x_left[y];
+            int x2 = x_right[y];
+            printf("\033[%d;%dH",y,x1); // Goto_XY(x1,y)
+            for(int x=x1; x<x2; ++x) {
+                putchar(' ');
+            }
+        }
+    }
+}
+
+
+/**********************************************************************************/
+
+/*
+ * Starting address of data stream stored in the 
+ * SPI.
+ * I put the data stream starting from 1M offset,
+ * just to make sure it does not collide with
+ * FPGA wiring configuration ! (but FPGA configuration
+ * only takes a few tens of kilobytes I think).
+ * Using the IO interface, it is using the physical address
+ *  (starting at 1M). Using the mapped memory interface,
+ *  SPI_FLASH_BASE is mapped to 1M.
+ */
+uint32_t spi_addr = 0;
+
+/*
+ * Word address and cached word used in mapped mode
+ */
+uint32_t spi_word_addr = 0;
+union {
+  uint32_t spi_word;
+  uint8_t spi_bytes[4];
+} spi_u;
+
+// Offset of the data stream (1M). Parenthesized so that the macro keeps
+// its value inside any expression (e.g. as the right operand of / or %).
+#define ADDR_OFFSET (1024*1024)
+
+/*
+ * Restarts reading from the beginning of the stream.
+ */
+void spi_reset() {
+  // No word address can be equal to this value: the next read reloads the cached word.
+  spi_word_addr = (uint32_t)(-1);
+  spi_addr = ADDR_OFFSET;
+}
+
+
+#ifdef __linux__
+
+FILE* f = NULL;
+
+/**
+ * Reads one byte of data from the file (emulates read_spi_byte() when running on desktop)
+ */
+uint8_t next_spi_byte() {
+   uint32_t word_addr = spi_addr >> 2;
+
+   // The data file is opened on the first call.
+   if(f == NULL) {
+      f = fopen("../../../FIRMWARE/EXAMPLES/DATA/scene1.dat","rb");
+      if(f == NULL) {
+         printf("Could not open data file\n");
+         exit(-1);
+      }
+   }
+
+   // Reload the cached word only when the address leaves it.
+   if(word_addr != spi_word_addr) {
+      spi_word_addr = word_addr;
+      fseek(f, spi_word_addr*4-ADDR_OFFSET, SEEK_SET);
+      fread(&(spi_u.spi_word), 4, 1, f);
+   }
+
+   // Pick the addressed byte of the cached word, then advance.
+   return spi_u.spi_bytes[(spi_addr++)&3];
+}
+
+#else
+
+
+# define SPI_FLASH_BASE ((uint32_t*)(1 << 23))
+
+/**
+ * Reads one byte from the SPI flash, using the mapped SPI flash interface.
+ */
+static inline uint8_t next_spi_byte() {
+   uint32_t word_addr = spi_addr >> 2;
+
+   // Fetch a new 32-bit word only when the address leaves the cached one.
+   if(word_addr != spi_word_addr) {
+      spi_word_addr = word_addr;
+      spi_u.spi_word = SPI_FLASH_BASE[spi_word_addr];
+   }
+
+   // Pick the addressed byte of the cached word, then advance.
+   return spi_u.spi_bytes[(spi_addr++)&3];
+}
+
+#endif
+
+static inline uint16_t next_spi_word() {
+   /* Words are stored in big endian format in the ST-NICCC file
+    * (see DATA/scene_description.txt): the first byte read is
+    * the most significant one.
+    */
+   uint16_t word = (uint16_t)(next_spi_byte() << 8);
+   word |= next_spi_byte();
+   return word;
+}
+
+/* 
+ * The colormap, encoded in such a way that it
+ * can be directly sent as ANSI color codes.
+ */
+int cmap[16];
+
+/*
+ * Current frame's vertices coordinates (if frame is indexed),
+ *  mapped from file coordinates to screen coordinates
+ *  by map_x() and map_y().
+ */
+uint8_t  X[255];
+uint8_t  Y[255];
+
+/*
+ * Current polygon vertices, as expected
+ * by GL_fillpoly():
+ * xi = poly[2*i], yi = poly[2*i+1]
+ */
+int      poly[30];
+
+/*
+ * Masks for frame flags.
+ */
+#define CLEAR_BIT   1
+#define PALETTE_BIT 2
+#define INDEXED_BIT 4
+
+/*
+ * Reads a frame's polygonal description from
+ * SPI flash and rasterizes the polygons using
+ * FemtoGL.
+ * returns 0 if last frame.
+ *   See DATA/scene_description.txt for the 
+ * ST-NICCC file format.
+ *   See DATA/test_ST_NICCC.c for an example
+ * program.
+ */
+int read_frame() RV32_FASTCODE;
+int read_frame() {
+    const uint8_t flags = next_spi_byte();
+    const int indexed = flags & INDEXED_BIT;
+
+    // Palette update: a 16-bit mask (most significant bit = entry 0),
+    // followed by one color word for each bit set in the mask.
+    if(flags & PALETTE_BIT) {
+        uint16_t mask = next_spi_word();
+        for(int entry=0; entry<16; ++entry) {
+            if(!(mask & (1 << (15-entry)))) {
+                continue;
+            }
+            int rgb = next_spi_word();
+
+            // Three bits per component, rescaled from [0..7] to [0..5]
+            int r = ((rgb & 0x700) >> 8) * 6 / 8;
+            int g = ((rgb & 0x070) >> 4) * 6 / 8;
+            int b =  (rgb & 0x007)       * 6 / 8;
+
+            // ANSI 8-bits color code (6x6x6 color cube starts at 16)
+            cmap[entry] = 16 + b + 6*(g + 6*r);
+        }
+    }
+
+    // CLEAR_BIT is deliberately ignored: the screen is not cleared
+    // between frames.
+
+    // Vertex table of an indexed frame
+    if(indexed) {
+        uint8_t count = next_spi_byte();
+        for(int v=0; v<count; ++v) {
+            X[v] = map_x(next_spi_byte());
+            Y[v] = map_y(next_spi_byte());
+        }
+    }
+
+    // Polygons, until a special code is met
+    for(;;) {
+        uint8_t desc = next_spi_byte();
+
+        switch(desc) {
+        case 0xff:
+            return 1; // end of frame
+        case 0xfe: {
+            // end of frame, and the next one starts in the next 64kb block
+            uint32_t offset = spi_addr - ADDR_OFFSET;
+            offset = (offset & ~65535) + 65536;
+            spi_addr = offset + ADDR_OFFSET;
+            return 1;
+        }
+        case 0xfd:
+            return 0; // end of stream
+        default:
+            break;
+        }
+
+        // Low nibble: number of vertices, high nibble: color index
+        uint8_t count = desc & 15;
+        for(int i=0; i<count; ++i) {
+            int* pt = &poly[2*i];
+            if(indexed) {
+                uint8_t index = next_spi_byte();
+                pt[0] = X[index];
+                pt[1] = Y[index];
+            } else {
+                pt[0] = map_x(next_spi_byte());
+                pt[1] = map_y(next_spi_byte());
+            }
+        }
+        GL_setcolor(cmap[desc >> 4]);
+        GL_fillpoly(count,poly);
+    }
+}
+
+
+/*
+ * Plays the polygon stream in an endless loop: each pass restarts from the
+ * beginning of the stream and renders frames until read_frame() returns 0.
+ * On the device, the frame counter is written to the LEDs; on Linux, a
+ * 20 ms pause is inserted after each frame instead.
+ * With WITH_WIREFRAME, the wireframe flag is toggled after each pass.
+ */
+int main() {
+    // printf("\x1B[?25l"); // hide cursor
+
+#ifndef __linux__   
+    IO_OUT(IO_LEDS,15);
+#endif
+    printf("starting\n");
+
+#ifdef WITH_WIREFRAME    
+     wireframe = 0;
+#endif     
+    int frame = 0;
+    GL_clear();
+    for(;;) {
+        spi_reset();
+        frame = 0;
+	while(read_frame()) {
+#ifdef WITH_WIREFRAME    
+	   // Filled frames are drawn over each other; wireframe frames need a clear.
+	   if(wireframe) {
+	      GL_clear();
+	   }
+#endif
+#ifdef __linux__       
+        usleep(20000);
+#else
+        IO_OUT(IO_LEDS,frame);
+#endif	   
+        ++frame;
+	}
+#ifdef WITH_WIREFRAME    
+        wireframe = !wireframe;
+#endif        
+    }
+}
--- a/FIRMWARE/blinker.S
+++ b/FIRMWARE/blinker.S
@@ -0,0 +1,21 @@
+# Simple blinker
+
+.equ IO_BASE, 0x400000  
+.equ IO_LEDS, 4
+
+.section .text
+
+.globl main
+
+# main: alternates two LED patterns forever (never returns).
+# NOTE(review): the stores are relative to gp, which is presumably set to
+# IO_BASE by the startup code before main is called - confirm in start.S.
+# `wait` is defined elsewhere (presumably a delay routine).
+main:
+.L0:
+	
+	li   t0, 5            # pattern 0b0101
+	sw   t0, IO_LEDS(gp)
+	call wait
+	li   t0, 10           # pattern 0b1010
+	sw   t0, IO_LEDS(gp)
+	call wait
+	j .L0
+
+
--- a/FIRMWARE/bram.ld
+++ b/FIRMWARE/bram.ld
@@ -0,0 +1,13 @@
+/* Memory map: a single block RAM region that holds code and data. */
+MEMORY
+{
+   BRAM (RWX) : ORIGIN = 0x0000, LENGTH = 0x1800  /* 6kB RAM */
+}
+/*
+ * A single output section, placed in BRAM: the .text section of start.o
+ * comes first (at the origin of BRAM), followed by every other input
+ * section whose name starts with a dot.
+ */
+SECTIONS
+{
+    everything :
+    {
+	. = ALIGN(4);
+	start.o (.text)
+        *(.*)
+    } >BRAM
+}
--- a/FIRMWARE/dhrystones.c
+++ b/FIRMWARE/dhrystones.c
@@ -0,0 +1,7 @@
+#define RISCV
+#define TIME
+#define USE_MYSTDLIB
+
+#include "DHRYSTONE/dhry_1.c"
+#include "DHRYSTONE/dhry_2.c"
+#include "DHRYSTONE/stubs.c"
--- a/FIRMWARE/donut.c
+++ b/FIRMWARE/donut.c
@@ -0,0 +1,182 @@
+// donut.c by Andy Sloane (@a1k0n)
+// https://gist.github.com/a1k0n/8ea6516b4946ab36348fb61703dc3194
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
+#define WITH_RV32M
+
+#define debug(...)
+//#define debug printf
+
+// torus radii and distance from camera
+// these are pretty baked-in to other constants now, so it probably won't work
+// if you change them too much.
+const int dz = 5, r1 = 1, r2 = 2;
+
+// "Magic circle algorithm"? DDA? I've seen this formulation in a few places;
+// first in Hal Chamberlain's Musical Applications of Microprocessors, but not
+// sure what to call it, or how to justify it theoretically. It seems to
+// correctly rotate around a point "near" the origin, without losing magnitude
+// over long periods of time, as long as there are enough bits of precision in x
+// and y. I use 14 bits here.
+#define R(s,x,y) x-=(y>>s); y+=(x>>s)
+
+// CORDIC algorithm to find magnitude of |x,y| by rotating the x,y vector onto
+// the x axis. This also brings vector (x2,y2) along for the ride, and writes
+// back to x2 -- this is used to rotate the lighting vector from the normal of
+// the torus surface towards the camera, and thus determine the lighting amount.
+// We only need to keep one of the two lighting normal coordinates.
+int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) {
+  int x2 = *x2_;
+
+  // mirror the vectors so that (x,y) starts in the right half-plane
+  if (x < 0) {
+    x = -x;
+    x2 = -x2;
+  }
+
+  // 8 CORDIC steps: each one rotates both vectors by the same elementary
+  // angle, in the direction that brings y towards zero
+  for (int shift = 0; shift < 8; shift++) {
+    const int prev_x = x;
+    const int prev_x2 = x2;
+    if (y >= 0) {
+      x += y >> shift;
+      y -= prev_x >> shift;
+      x2 += y2 >> shift;
+      y2 -= prev_x2 >> shift;
+    } else {
+      x -= y >> shift;
+      y += prev_x >> shift;
+      x2 -= y2 >> shift;
+      y2 += prev_x2 >> shift;
+    }
+  }
+
+  // multiply by 0.625 (= 1/2 + 1/8), a cheap approximation of the 0.607
+  // scaling factor that compensates the gain introduced by this algorithm
+  // (see https://en.wikipedia.org/wiki/CORDIC)
+  const int scaled_x2 = (x2 >> 1) + (x2 >> 3);
+  *x2_ = scaled_x2;
+  return (x >> 1) + (x >> 3);
+}
+
+// Renders the rotating torus forever as ASCII art: 23 rows of 79 characters
+// per frame. For each character, a ray is marched (with length_cordic() as
+// distance estimator) until it hits the torus or goes too far.
+void main() {
+  // high-precision rotation directions, sines and cosines and their products
+  int16_t sB = 0, cB = 16384;
+  int16_t sA = 11583, cA = 11583;
+  int16_t sAsB = 0, cAsB = 0;
+  int16_t sAcB = 11583, cAcB = 11583;
+
+  for (;;) {
+    int x1_16 = cAcB << 2;
+
+    // yes this is a multiply but dz is 5 so it's (sb + (sb<<2)) >> 6 effectively
+    int p0x = dz * sB >> 6;
+    int p0y = dz * sAcB >> 6;
+    int p0z = -dz * cAcB >> 6;
+
+    const int r1i = r1*256;
+    const int r2i = r2*256;
+
+    int niters = 0;
+    int nnormals = 0;
+    int16_t yincC = (cA >> 6) + (cA >> 5);      // 12*cA >> 8;
+    int16_t yincS = (sA >> 6) + (sA >> 5);      // 12*sA >> 8;
+    int16_t xincX = (cB >> 7) + (cB >> 6);      // 6*cB >> 8;
+    int16_t xincY = (sAsB >> 7) + (sAsB >> 6);  // 6*sAsB >> 8;
+    int16_t xincZ = (cAsB >> 7) + (cAsB >> 6);  // 6*cAsB >> 8;
+    int16_t ycA = -((cA >> 1) + (cA >> 4));     // -12 * yinc1 = -9*cA >> 4;
+    int16_t ysA = -((sA >> 1) + (sA >> 4));     // -12 * yinc2 = -9*sA >> 4;
+    //int dmin = INT_MAX, dmax = -INT_MAX;
+    // one iteration per text row
+    for (int j = 0; j < 23; j++, ycA += yincC, ysA += yincS) {
+      int xsAsB = (sAsB >> 4) - sAsB;  // -40*xincY
+      int xcAsB = (cAsB >> 4) - cAsB;  // -40*xincZ;
+
+      int16_t vxi14 = (cB >> 4) - cB - sB; // -40*xincX - sB;
+      int16_t vyi14 = ycA - xsAsB - sAcB;
+      int16_t vzi14 = ysA + xcAsB + cAcB;
+
+      // one iteration per character of the row
+      for (int i = 0; i < 79; i++, vxi14 += xincX, vyi14 -= xincY, vzi14 += xincZ) {
+        int t = 512; // (256 * dz) - r2i - r1i;
+
+        int16_t px = p0x + (vxi14 >> 5); // assuming t = 512, t*vxi>>8 == vxi<<1
+        int16_t py = p0y + (vyi14 >> 5);
+        int16_t pz = p0z + (vzi14 >> 5);
+        debug("pxyz (%+4d,%+4d,%+4d)\n", px, py, pz);
+        int16_t lx0 = sB >> 2;
+        int16_t ly0 = sAcB - cA >> 2;
+        int16_t lz0 = -cAcB - sA >> 2;
+        // ray marching loop: exits through one of the two breaks below
+        for (;;) {
+          int t0, t1, t2, d;
+          int16_t lx = lx0, ly = ly0, lz = lz0;
+          debug("[%2d,%2d] (px, py) = (%d, %d), (lx, ly) = (%d, %d) -> ", j, i, px, py, lx, ly);
+          t0 = length_cordic(px, py, &lx, ly);
+          debug("t0=%d (lx', ly') = (%d, %d)\n", t0, lx, ly);
+          t1 = t0 - r2i;
+          t2 = length_cordic(pz, t1, &lz, lx);
+          d = t2 - r1i;
+          t += d;
+
+          if (t > 8*256) {
+            // marched too far: background
+            putchar(' ');
+            break;
+          } else if (d < 2) {
+            // close enough to the surface: shade from the rotated light vector
+            int N = lz >> 9;
+            putchar(".,-~:;!*=#$@"[N > 0 ? N < 12 ? N : 11 : 0]);
+            nnormals++;
+            break;
+          }
+          // todo: shift and add version of this
+
+	   
+          /*
+            if (d < dmin) dmin = d;
+            if (d > dmax) dmax = d;
+	   */
+
+#ifdef WITH_RV32M	   
+            px += d*vxi14 >> 14;
+            py += d*vyi14 >> 14;
+            pz += d*vzi14 >> 14;
+#else
+          {
+            // 11x1.14 fixed point 3x parallel multiply
+            // only 16 bit registers needed; starts from highest bit to lowest
+            // d is about 2..1100, so 11 bits are sufficient
+            int16_t dx = 0, dy = 0, dz = 0;
+            int16_t a = vxi14, b = vyi14, c = vzi14;
+            while (d) {
+              if (d&1024) {
+                dx += a;
+                dy += b;
+                dz += c;
+              }
+              d = (d&1023) << 1;
+              a >>= 1;
+              b >>= 1;
+              c >>= 1;
+            }
+            // we already shifted down 10 bits, so get the last four
+            px += dx >> 4;
+            py += dy >> 4;
+            pz += dz >> 4;
+          }
+#endif
+          niters++;
+        }
+      }
+      puts("");
+    }
+    printf("%d iterations %d lit pixels\x1b[K", niters, nnormals);
+//    fflush(stdout);
+
+    // rotate sines, cosines, and products thereof
+    // this animates the torus rotation about two axes
+    R(5, cA, sA);
+    R(5, cAsB, sAsB);
+    R(5, cAcB, sAcB);
+    R(6, cB, sB);
+    R(6, cAcB, cAsB);
+    R(6, sAcB, sAsB);
+
+//    usleep(15000);
+    // back to the first column, then up 23 lines: the next frame overwrites this one
+    printf("\r\x1b[23A");
+  }
+}
--- a/FIRMWARE/donut2.c
+++ b/FIRMWARE/donut2.c
@@ -0,0 +1,427 @@
+// donut.c by Andy Sloane (@a1k0n)
+// https://gist.github.com/a1k0n/8ea6516b4946ab36348fb61703dc3194
+// Bruno Levy: added ANSI "pseudo-graphics", and RISC-V statistics
+
+#define CPU_NAME "TordBoyau ULX3S" // Name of your CPU and FPGA board
+#define MHZ 95                     // Frequency (without a timer we cannot guess)
+#define USE_MUL                    // Define if you support RV32M 
+
+// #define PRECISE // Define for a more accurate result (but it costs a bit)
+#define START_FRAMES 20 // Number of frames without display
+                        // (for accurate CPI/MIPS measurements)
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+
+// 0 15 31 47 63 79 96 112 127 143 159 175 191 207 223 240 255
+ 
+// Color table, indexed by the color numbers given to setpixel() and
+// setcolors(). Each entry is the tail of an ANSI color parameter: it is
+// appended to "\033[4" to select a background and to "\033[3" to select
+// a foreground.
+//   entry 0       : "0"
+//   entries 1..24 : 256-color palette indices 232..255
+//   entry 25      : "7"
+//   entries 26..33: 256-color palette indices 16..23
+const char* colormap[34] = {
+    "0",
+    "8;5;232",
+    "8;5;233",
+    "8;5;234",
+    "8;5;235",
+    "8;5;236",
+    "8;5;237",
+    "8;5;238",
+    "8;5;239",
+    "8;5;240",
+    "8;5;241",
+    "8;5;242",
+    "8;5;243",
+    "8;5;244",
+    "8;5;245",
+    "8;5;246",
+    "8;5;247",
+    "8;5;248",
+    "8;5;249",
+    "8;5;250",
+    "8;5;251",
+    "8;5;252",
+    "8;5;253",
+    "8;5;254",
+    "8;5;255",
+    "7",
+    "8;5;16",
+    "8;5;17",
+    "8;5;18",
+    "8;5;19",
+    "8;5;20",
+    "8;5;21",
+    "8;5;22",
+    "8;5;23", 
+};
+
+int prev_color1=0;
+int prev_color2=0;
+
+char scanline[80];
+
+#ifdef __linux__
+
+// Desktop build: there is no RISC-V counter to read, both functions return 0.
+
+uint64_t my_rdcycle() {
+    return 0;
+}
+
+uint64_t my_rdinstret() {
+    return 0;
+}
+
+#else
+
+// Returns the 64-bit cycle counter.
+// The counter is read as two 32-bit halves. The high half is read before and
+// after the low half, and the whole sequence is repeated until both reads of
+// the high half agree, so that a carry from the low half into the high half
+// between the reads cannot produce an inconsistent value.
+uint64_t my_rdcycle() {
+    uint32_t a0,a1,t0;
+    do {
+        __asm__ __volatile__ ("rdcycleh %0" : "=r" (a1));
+        __asm__ __volatile__ ("rdcycle %0" : "=r" (a0));
+        __asm__ __volatile__ ("rdcycleh %0" : "=r" (t0));
+    } while(t0 != a1);
+
+    return ((uint64_t)a1 << 32) | a0;
+}
+
+// Returns the 64-bit retired instructions counter, read with the same
+// retry sequence as my_rdcycle().
+uint64_t my_rdinstret() {
+    uint32_t a0,a1,t0;
+    do {
+        __asm__ __volatile__ ("rdinstreth %0" : "=r" (a1));
+        __asm__ __volatile__ ("rdinstret %0" : "=r" (a0));
+        __asm__ __volatile__ ("rdinstreth %0" : "=r" (t0));
+    } while(t0 != a1);
+
+    return ((uint64_t)a1 << 32) | a0;
+}
+
+#endif
+
+uint64_t stats_cycles_init = 0;
+uint64_t stats_instructions_init = 0;
+uint64_t stats_cycles = 0;
+uint64_t stats_instructions = 0;
+int stats_CPI_times_1000 = 0;
+
+// Starts a measurement: records the current cycle and instruction counters.
+void stats_start() {
+    stats_cycles_init       = my_rdcycle();
+    stats_instructions_init = my_rdinstret();
+}
+
+// Ends a measurement: computes the cycles and instructions elapsed since
+// stats_start(), and the resulting CPI (times 1000).
+void stats_end() {
+    uint64_t cycles       = my_rdcycle() - stats_cycles_init;
+    uint64_t instructions = my_rdinstret() - stats_instructions_init;
+
+    // Both counts are clamped to at least 1 (they are used as divisors).
+    stats_cycles       = (cycles == 0)       ? 1 : cycles;
+    stats_instructions = (instructions == 0) ? 1 : instructions;
+
+    stats_CPI_times_1000 = (int)((stats_cycles * 1000)/stats_instructions);
+}
+
+// Print "fixed point" number (integer/1000)
+static void printk(uint64_t kx) {
+    int whole       = (int)(kx / 1000);
+    int thousandths = (int)(kx % 1000);
+
+    printf("%d.",whole);
+
+    // pad the fractional part with zeros, up to three digits
+    if(thousandths < 10) {
+        printf("00");
+    } else if(thousandths < 100) {
+        printf("0");
+    }
+    printf("%d",thousandths);
+}
+
+// Selects the foreground (fg) and background (bg) colors, both given
+// as indices in colormap.
+static inline void setcolors(int fg, int bg) {
+    printf("\033[4%s;3%sm",colormap[bg],colormap[fg]);
+}
+
+// Outputs pixel (x,y) with a color given as an index in colormap.
+// Two image rows share one text line: the colors of an even row are only
+// stored in scanline, and each pixel of the following odd row prints one
+// character that shows both pixels (upper one as background, lower one as
+// the foreground of a lower block character). Color escape sequences are
+// sent only when a color differs from the last one sent.
+static inline void setpixel(int x, int y, int color) {
+    if(!(y&1)) {
+        // even row: remember the color, nothing is printed yet
+        scanline[x] = color;
+        return;
+    }
+
+    const int upper = scanline[x]; // color stored by the previous (even) row
+    const int lower = color;
+
+    if(upper == lower) {
+        // both halves identical: a space on that background is enough
+        if(prev_color1 != upper) {
+            printf("\033[4%sm ",colormap[upper]);
+            prev_color1 = upper;
+        } else {
+            putchar(' ');
+        }
+        return;
+    }
+
+    const int bg_stale = (prev_color1 != upper);
+    const int fg_stale = (prev_color2 != lower);
+    if(bg_stale && fg_stale) {
+        printf("\033[4%s;3%sm",colormap[upper],colormap[lower]);
+    } else if(bg_stale) {
+        printf("\033[4%sm",colormap[upper]);
+    } else if(fg_stale) {
+        printf("\033[3%sm",colormap[lower]);
+    }
+    // both colors are now the active ones
+    prev_color1 = upper;
+    prev_color2 = lower;
+    printf("\u2583");
+}
+
+#define debug(...)
+//#define debug printf
+
+// torus radii and distance from camera
+// these are pretty baked-in to other constants now, so it probably won't work
+// if you change them too much.
+const int dz = 5, r1 = 1, r2 = 2;
+
+// "Magic circle algorithm"? DDA? I've seen this formulation in a few places;
+// first in Hal Chamberlain's Musical Applications of Microprocessors, but not
+// sure what to call it, or how to justify it theoretically. It seems to
+// correctly rotate around a point "near" the origin, without losing magnitude
+// over long periods of time, as long as there are enough bits of precision in x
+// and y. I use 14 bits here.
+#define R(s,x,y) x-=(y>>s); y+=(x>>s)
+
+// CORDIC algorithm to find magnitude of |x,y| by rotating the x,y vector onto
+// the x axis. This also brings vector (x2,y2) along for the ride, and writes
+// back to x2 -- this is used to rotate the lighting vector from the normal of
+// the torus surface towards the camera, and thus determine the lighting amount.
+// We only need to keep one of the two lighting normal coordinates.
+int length_cordic(int16_t x, int16_t y, int16_t *x2_, int16_t y2) {
+
+// Number of CORDIC steps
+#ifdef PRECISE
+   #define NIT 10
+#else
+   #define NIT 5
+#endif
+
+  int x2 = *x2_;
+
+  // mirror the vectors so that (x,y) starts in the right half-plane
+  if (x < 0) {
+    x = -x;
+    x2 = -x2;
+  }
+
+  // each step rotates both vectors by the same elementary angle, in the
+  // direction that brings y towards zero
+  for (int shift = 0; shift<NIT; shift++) {
+    const int prev_x = x;
+    const int prev_x2 = x2;
+    if (y >= 0) {
+      x += y >> shift;
+      y -= prev_x >> shift;
+      x2 += y2 >> shift;
+      y2 -= prev_x2 >> shift;
+    } else {
+      x -= y >> shift;
+      y += prev_x >> shift;
+      x2 -= y2 >> shift;
+      y2 += prev_x2 >> shift;
+    }
+  }
+
+  // multiply by 0.625 (= 1/2 + 1/8), a cheap approximation of the 0.607
+  // scaling factor that compensates the gain introduced by this algorithm
+  // (see https://en.wikipedia.org/wiki/CORDIC)
+  *x2_ = (x2 >> 1) + (x2 >> 3);
+
+  int length = (x >> 1) + (x >> 3);
+#ifdef PRECISE
+  length -= (x >> 6); // get nearer to 0.607 [Inigo Quilez]
+#endif
+  return length;
+}
+
+/*
+ * Renders the rotating torus forever, with two image rows per text line
+ * (see setpixel()), followed by a status line with FPS, CPI and MIPS figures.
+ *
+ * The frames up to START_FRAMES are computed without any display. The
+ * "accurate" CPI and MIPS figures are measured once, over these frames
+ * (from the stats_start() done before the loop to the end of frame
+ * START_FRAMES). They read 0 on the status line until then.
+ * Once the display is on, the statistics are restarted at each frame.
+ */
+int main() {
+
+   printf( "\033[48;5;16m"   // set background color black
+           "\033[38;5;15m"   // set foreground color white
+           "\033[H"          // home
+           "\033[?25l"       // hide cursor
+           "\033[2J");       // clear screen
+
+  int frame = 0;
+
+  // high-precision rotation directions, sines and cosines and their products
+  int16_t sB = 0, cB = 16384;
+  int16_t sA = 11583, cA = 11583;
+  int16_t sAsB = 0, cAsB = 0;
+  int16_t sAcB = 11583, cAcB = 11583;
+
+  // Initialized because the status line prints them on every frame,
+  // including the frames that come before the measurement.
+  int accurate_CPI_x_1000 = 0;
+  int accurate_MIPS_x_1000 = 0;
+  int CPI_x_1000;
+
+  stats_start();
+
+  for (;;) {
+
+    int display_on = (frame > START_FRAMES);
+    if(display_on) {
+        stats_start();
+    }
+
+    // yes this is a multiply but dz is 5 so it's (sb + (sb<<2)) >> 6 effectively
+    int p0x = dz * sB >> 6;
+    int p0y = dz * sAcB >> 6;
+    int p0z = -dz * cAcB >> 6;
+
+    const int r1i = r1*256;
+    const int r2i = r2*256;
+
+    int niters = 0;
+    int nnormals = 0;
+    int16_t yincC = (cA >> 6) + (cA >> 5);      // 12*cA >> 8;
+    int16_t yincS = (sA >> 6) + (sA >> 5);      // 12*sA >> 8;
+    int16_t xincX = (cB >> 7) + (cB >> 6);      // 6*cB >> 8;
+    int16_t xincY = (sAsB >> 7) + (sAsB >> 6);  // 6*sAsB >> 8;
+    int16_t xincZ = (cAsB >> 7) + (cAsB >> 6);  // 6*cAsB >> 8;
+    int16_t ycA = -((cA >> 1) + (cA >> 4));     // -12 * yinc1 = -9*cA >> 4;
+    int16_t ysA = -((sA >> 1) + (sA >> 4));     // -12 * yinc2 = -9*sA >> 4;
+    //int dmin = INT_MAX, dmax = -INT_MAX;
+
+    int xsAsB = (sAsB >> 4) - sAsB;  // -40*xincY
+    int xcAsB = (cAsB >> 4) - cAsB;  // -40*xincZ;
+
+    // 46 image rows = 23 text lines (the row increments are halved accordingly)
+    for (int j = 0; j < 46; j++, ycA += yincC>>1, ysA += yincS>>1) {
+
+      int16_t vxi14 = (cB >> 4) - cB - sB; // -40*xincX - sB;
+      int16_t vyi14 = ycA - xsAsB - sAcB;
+      int16_t vzi14 = ysA + xcAsB + cAcB;
+
+      for (int i = 0; i < 79; i++, vxi14 += xincX, vyi14 -= xincY, vzi14 += xincZ) {
+        int t = 512; // (256 * dz) - r2i - r1i;
+
+        int16_t px = p0x + (vxi14 >> 5); // assuming t = 512, t*vxi>>8 == vxi<<1
+        int16_t py = p0y + (vyi14 >> 5);
+        int16_t pz = p0z + (vzi14 >> 5);
+        debug("pxyz (%+4d,%+4d,%+4d)\n", px, py, pz);
+        int16_t lx0 = sB >> 2;
+        int16_t ly0 = sAcB - cA >> 2;
+        int16_t lz0 = -cAcB - sA >> 2;
+        // ray marching loop: exits through one of the two breaks below
+        for (;;) {
+          int t0, t1, t2, d;
+          int16_t lx = lx0, ly = ly0, lz = lz0;
+          debug("[%2d,%2d] (px, py) = (%d, %d), (lx, ly) = (%d, %d) -> ", j, i, px, py, lx, ly);
+          t0 = length_cordic(px, py, &lx, ly);
+          debug("t0=%d (lx', ly') = (%d, %d)\n", t0, lx, ly);
+          t1 = t0 - r2i;
+          t2 = length_cordic(pz, t1, &lz, lx);
+          d = t2 - r1i;
+          t += d;
+
+          if (t > 8*256) {
+            // marched too far: background, a checkerboard that scrolls with the frame
+            // putchar(' ');
+            int N = (((j-frame)>>3)^(((i+frame)>>3)))&1;
+            if(display_on) setpixel(i,j,(N<<2)+26);
+            break;
+          } else if (d < 2) {
+            // close enough to the surface: shade from the rotated light vector
+            int N = lz >> 8;
+            // putchar(".,-~:;!*=#$@"[N > 0 ? N < 12 ? N : 11 : 0]);
+            N = N > 0 ? N < 26 ? N : 25 : 0;
+            if(display_on) setpixel(i,j,N);
+            nnormals++;
+            break;
+          }
+          // todo: shift and add version of this
+
+          /*
+            if (d < dmin) dmin = d;
+            if (d > dmax) dmax = d;
+           */
+
+#ifdef USE_MUL
+          px += d*vxi14 >> 14;
+          py += d*vyi14 >> 14;
+          pz += d*vzi14 >> 14;
+#else
+          {
+            // 11x1.14 fixed point 3x parallel multiply
+            // only 16 bit registers needed; starts from highest bit to lowest
+            // d is about 2..1100, so 11 bits are sufficient
+            int16_t dx = 0, dy = 0, dz = 0;
+            int16_t a = vxi14, b = vyi14, c = vzi14;
+            while (d) {
+              if (d&1024) {
+                dx += a;
+                dy += b;
+                dz += c;
+              }
+              d = (d&1023) << 1;
+              a >>= 1;
+              b >>= 1;
+              c >>= 1;
+            }
+            // we already shifted down 10 bits, so get the last four
+            px += dx >> 4;
+            py += dy >> 4;
+            pz += dz >> 4;
+          }
+#endif
+          niters++;
+        }
+      }
+      // a text line is complete after each odd image row
+      if(display_on && (j&1)) puts("");
+    }
+    if(display_on) printf("\033[0m"); // reset colors
+
+    stats_end();
+
+    if(frame == START_FRAMES) {
+        accurate_CPI_x_1000 = stats_CPI_times_1000;
+        accurate_MIPS_x_1000 = (MHZ * 1000000) / accurate_CPI_x_1000;
+    }
+
+    CPI_x_1000 = stats_CPI_times_1000;
+
+    uint64_t FPS_num   = (uint64_t)(MHZ) * 1000000 * 1000;
+    uint64_t FPS_denom = stats_cycles;
+    int FPSx1000 = (int)(FPS_num / FPS_denom);
+
+    // status line
+    setcolors(25,33);
+#ifdef USE_MUL
+    printf("%s RV32IM %dMHz ", CPU_NAME, MHZ);
+#else
+    printf("%s RV32I %dMHz ", CPU_NAME, MHZ);
+#endif
+
+    setcolors(25,0);
+    printf(" "); printk(FPSx1000); printf(" FPS ");
+    setcolors(0,25);
+    printf(" "); printk(CPI_x_1000);
+    printf(" ("); printk(accurate_CPI_x_1000); printf(") CPI ");
+    setcolors(25,0);
+    printf(" "); printk(accurate_MIPS_x_1000); printf(" MIPS");
+    /*
+    setcolors(0,25);
+    printf(" %d iterations ", niters);
+    setcolors(0,25);
+    printf(" %d lit pixels ", nnormals);
+    */
+    setcolors(25,0);
+    printf("\x1b[K");
+
+#ifdef __linux__
+    fflush(stdout);
+#endif
+
+    // rotate sines, cosines, and products thereof
+    // this animates the torus rotation about two axes
+    R(5, cA, sA);
+    R(5, cAsB, sAsB);
+    R(5, cAcB, sAcB);
+    R(6, cB, sB);
+    R(6, cAcB, cAsB);
+    R(6, sAcB, sAsB);
+
+#ifdef __linux__
+    usleep(15000);
+#endif
+    // back to the first column, then up 23 lines: the next frame overwrites this one
+    printf("\r\x1b[23A");
+    ++frame;
+    // forget the active colors, so that the next frame sends them again
+    prev_color1=-1;
+    prev_color2=-1;
+  }
+
+  return 0;
+}
+
--- a/FIRMWARE/errno.c
+++ b/FIRMWARE/errno.c
@@ -0,0 +1,11 @@
+
+// Sometimes __errno is not linked, here is a dummy replacement.
+// Note that __errno is a function that returns a pointer to the
+// actual __errno (this is for multithreading). Made me bang my 
+// head to the wall (and made tinyraytracer crash because powf()
+// was *calling* __errno).
+
+// Returns the address of one function-local static int that plays the
+// role of errno; every call hands back the same address.
+int* __errno()  {
+   static int errno_storage = 0;
+   int* location = &errno_storage;
+   return location;
+}
--- a/FIRMWARE/hello.S
+++ b/FIRMWARE/hello.S
@@ -0,0 +1,27 @@
+# Hello world !
+	
+.section .text
+.globl main
+
+# main: prints the "hello" string through putstring, forever.
+main:
+1:
+	la   a0, hello      # a0 <- address of the message
+	call putstring
+	j    1b             # start over
+
+# putstring: sends the NUL-terminated string at a0 to putchar,
+# one byte at a time.
+#   in:       a0 = address of the string
+#   clobbers: a0, plus whatever putchar clobbers
+# The string cursor is kept in s0 (callee-saved) instead of a temporary
+# register, so it survives "call putchar" whatever putchar clobbers.
+putstring:
+	addi sp,sp,-16    # 16-byte frame: keeps sp aligned for the callee
+	sw   ra,12(sp)    # save ra (needed because this function calls putchar)
+	sw   s0,8(sp)     # save s0 before using it as the cursor
+	mv   s0,a0
+.L1:    lbu  a0,0(s0)     # next character
+	beqz a0,.L2       # NUL terminator: done
+	call putchar
+	addi s0,s0,1
+	j    .L1
+.L2:    lw   s0,8(sp)     # restore s0
+	lw   ra,12(sp)    # restore ra
+	addi sp,sp,16     # release the frame
+	ret
+
+.section .data
+hello:
+	.asciz "Hello, world !\n"
--- a/FIRMWARE/humanshader.c
+++ b/FIRMWARE/humanshader.c
@@ -0,0 +1,113 @@
+// C version of humanshader
+// See https://humanshader.com/
+// (using a computer is clearly not as fun, but it is interesting to have
+//  a small not too computationally expensive raytracing program that
+//  can run on small softcores for FPGAs).
+// Using the 16-bits version with no divide from here: https://www.shadertoy.com/view/XflXDs
+
+#define GL_width  71
+#define GL_height 40
+#include "GL_tty.h"
+
+// Computes the color of pixel (x,y) using integer arithmetic only and
+// stores the three 8-bit channels through r_out, g_out and b_out.
+// The pixel is classified as sphere, ground or sky; red and blue are
+// shaded per class, clamped to 255, and green is derived from them.
+void human_shader(
+    int x, int y, uint8_t* r_out, uint8_t* g_out, uint8_t* b_out
+) {
+    int R, B;
+
+    // Section A: coordinates relative to (36,18) and squared distance
+    const int u  = x-36;
+    const int v  = 18-y;
+    const int u2 = u*u;
+    const int v2 = v*v;
+    const int h  = u2 + v2;
+
+    if( h < 200 ) {
+        // Section B: sphere
+        const int t = 5200 + (h<<3);
+        const int p = (t*u)>>7;
+        const int q = (t*v)>>7;
+
+        R = 420;
+        B = 520;
+
+        // bounce light
+        const int bounce = 18 + (((p*5-q*13))>>9);
+        if( bounce>0 ) {
+            R += bounce*bounce;
+        }
+
+        // sky light / ambient occlusion
+        const int occlusion = q + 900;
+        R = (R*occlusion)>>12;
+        B = (B*occlusion)>>12;
+
+        // sun/key light
+        if( p > -q ) {
+            const int key = (p+q)>>3;
+            R += key;
+            B += key;
+        }
+    } else if( v<0 ) {
+        // Section C: ground
+        const int p = h + (v2<<3);
+        const int c = 240*(-v) - p;
+
+        R = 150 + (v<<1);
+        B = 50;
+
+        // sky light / ambient occlusion
+        if( c>1200 ) {
+            int o = (25*c)>>3;
+            o = (c*(7840-o)>>9) - 8560;
+            R = (R*o)>>10;
+            B = (B*o)>>10;
+        }
+
+        // sun/key light with soft shadow
+        const int r = c + u*v;
+        const int d = 3200 - h - (r<<1);
+        if( d>0 ) {
+            R += d;
+        }
+    } else {
+        // Section D: sky
+        const int c = x + (y<<2);
+        R = 132 + c;
+        B = 192 + c;
+    }
+
+    // Section E: clamp (upper bound only), then derive green
+    R = (R > 255) ? 255 : R;
+    B = (B > 255) ? 255 : B;
+    const int G = (R*11 + 5*B)>>4;
+
+    *r_out = (uint8_t)R;
+    *g_out = (uint8_t)G;
+    *b_out = (uint8_t)B;
+}
+
+// Entry point: brackets a single GL_scan_RGB call between GL_init and
+// GL_terminate, passing the GL_width x GL_height size and human_shader
+// as the per-pixel callback.
+// NOTE(review): the GL_* functions come from GL_tty.h, which is not
+// visible here; presumably GL_scan_RGB calls the shader once per pixel
+// and prints the result to the terminal - confirm.
+int main() {
+    GL_init();
+    GL_scan_RGB(GL_width, GL_height, human_shader);
+    GL_terminate();
+    return 0;
+}
--- a/FIRMWARE/io.h
+++ b/FIRMWARE/io.h
@@ -0,0 +1,10 @@
+#include <stdint.h>
+
+// Base address of the memory-mapped IO page and byte offsets of the
+// registers inside it.
+#define IO_BASE       0x400000
+#define IO_LEDS       4
+#define IO_UART_DAT   8
+#define IO_UART_CNTL  16
+
+// Read / write one 32-bit IO register. 'port' is a byte offset from
+// IO_BASE. The argument and the whole expansion are parenthesized so
+// the macros behave like ordinary expressions whatever is passed in
+// (e.g. IO_IN(cond ? A : B)) and wherever they are used.
+#define IO_IN(port)       (*(volatile uint32_t*)(IO_BASE + (port)))
+#define IO_OUT(port,val)  (*(volatile uint32_t*)(IO_BASE + (port))=(val))
+
--- a/FIRMWARE/mandel_C.c
+++ b/FIRMWARE/mandel_C.c
@@ -0,0 +1,99 @@
+/*
+ Computes and displays the Mandelbrot set on an ANSI terminal
+ (colored cells drawn with 24-bit background-color escape sequences).
+*/
+
+#include <stdio.h>
+
+#ifdef __linux__
+#include <unistd.h>
+#else
+#include "io.h"
+#endif
+
+// Image size, in character cells.
+#define W 46
+#define H 46
+
+// Fixed-point setup: values carry mandel_shift fractional bits.
+// Every expansion is parenthesized so that the macros can be used
+// inside larger expressions (e.g. "a % dx") without precedence surprises.
+#define mandel_shift 10
+#define mandel_mul (1 << mandel_shift)
+#define xmin (-2*mandel_mul)
+#define ymax ( 2*mandel_mul)
+#define ymin (-2*mandel_mul)
+#define xmax ( 2*mandel_mul)
+// Per-cell steps: horizontal step uses the width, vertical the height.
+#define dx ((xmax-xmin)/W)
+#define dy ((ymax-ymin)/H)
+// Escape threshold compared against Zrr + Zii.
+#define norm_max (4 << mandel_shift)
+
+
+#define ANSIRGB(R,G,B) "\033[48;2;" #R ";"  #G ";" #B "m  "
+
+
+const char* colormap[21] = {
+   ANSIRGB( 0, 0,  0),
+   ANSIRGB( 0, 0, 40),
+   ANSIRGB( 0, 0, 80),
+   ANSIRGB( 0, 0,120),
+   ANSIRGB( 0, 0,160),
+   ANSIRGB( 0, 0,200),
+   ANSIRGB( 0, 0,240),
+   
+   ANSIRGB( 0,  0, 0),
+   ANSIRGB( 0, 40, 0),
+   ANSIRGB( 0, 80, 0),
+   ANSIRGB( 0,120, 0),
+   ANSIRGB( 0,160, 0),
+   ANSIRGB( 0,200, 0),
+   ANSIRGB( 0,240, 0),
+
+   ANSIRGB(   0, 0, 0),
+   ANSIRGB(  40, 0, 0),
+   ANSIRGB(  80, 0, 0),
+   ANSIRGB( 120, 0, 0),
+   ANSIRGB( 160, 0, 0),
+   ANSIRGB( 200, 0, 0),
+   ANSIRGB( 240, 0, 0)
+};
+
+// Draws the Mandelbrot set in an endless loop, one W x H frame per pass.
+// Each cell is iterated at most 20 times in fixed point; the remaining
+// iteration count plus the frame number picks one of the 21 colormap
+// entries, so the palette cycles from frame to frame.
+int main() {
+   int frame=0;
+   for(;;) {
+#ifndef __linux__
+      // io.h (IO_OUT, IO_LEDS) is only included for non-Linux builds,
+      // so the LED update has to be compiled out on Linux.
+      IO_OUT(IO_LEDS,frame);
+#endif
+      int last_color = -1;
+      printf("\033[H");              // cursor back to the top-left corner
+      int Ci = ymin;
+      for(int Y=0; Y<H; ++Y) {
+         int Cr = xmin;
+         for(int X=0; X<W; ++X) {
+            int Zr = Cr;
+            int Zi = Ci;
+            int iter = 20;
+            while(iter > 0) {
+               int Zrr = (Zr * Zr) >> mandel_shift;
+               int Zii = (Zi * Zi) >> mandel_shift;
+               int Zri = (Zr * Zi) >> (mandel_shift - 1);   // this is 2*Zr*Zi
+               Zr = Zrr - Zii + Cr;
+               Zi = Zri + Ci;
+               if(Zrr + Zii > norm_max) {
+                  break;
+               }
+               --iter;
+            }
+            int color = (iter+frame)%21;
+            // When the color did not change, two plain spaces are enough:
+            // the background color set by the previous cell is still active.
+            printf(color == last_color ? "  " : colormap[color]);
+            last_color = color;
+            Cr += dx;
+         }
+         Ci += dy;
+         printf("\033[49m\n");       // default background, next line
+         last_color = -1;            // force a color escape at the start of the row
+      }
+      ++frame;
+#ifdef __linux__
+        usleep(100000);
+#endif
+//      if(frame>4) break;
+   }
+
+}
+
+
+
--- a/FIRMWARE/mandelbrot.S
+++ b/FIRMWARE/mandelbrot.S
@@ -0,0 +1,125 @@
+# Computes and displays the Mandelbrot set on the terminal.
+# Needs NRV_IO_UART to be enabled. 
+#
+# To access it, use:
+#   miniterm.py --dtr=0 /dev/ttyUSB1 115200
+#   or screen /dev/ttyUSB1 115200 (<ctrl> a \ to exit)
+
+
+# Base address of memory-mapped IO,
+# Loaded into gp at startup
+.equ IO_BASE, 0x400000  
+
+# IO-reg offsets. To read or write one of them,
+# use IO_XXX(gp)
+.equ IO_LEDS, 4
+.equ IO_UART_DAT, 8
+.equ IO_UART_CNTL, 16
+
+.equ mandel_shift, 10
+.equ mandel_mul,(1 << mandel_shift)	
+.equ xmin, -2*mandel_mul
+.equ xmax,  2*mandel_mul
+.equ ymin, -2*mandel_mul
+.equ ymax,  2*mandel_mul	
+.equ dx, (xmax-xmin)/80
+.equ dy, (ymax-ymin)/80
+.equ norm_max,(4 << mandel_shift)
+
+.section .text
+
+# Register allocation:
+# X,Y           : s0,s1
+# Cr,Ci         : s2,s3
+# Zr,Zi         : s4,s5
+# Zrr,2Zri,Zii  : s6,s7,s8
+# iteration cnt : s10
+# 80 (number of rows and of columns): s11
+
+.globl main
+
+# main: draws the Mandelbrot set as an 80x80 grid of ASCII characters
+# sent through putchar, forever. Expects gp to hold IO_BASE.
+# All loop state lives in s-registers so that it survives the calls
+# to wait, __mulsi3 and putchar. Never returns.
+main:
+mandelstart:
+
+	# LED sequence shown before every frame: 5, 10, 5, 10, then off.
+	li   t0, 5
+	sw   t0, IO_LEDS(gp)
+	call wait
+	li   t0, 10
+	sw   t0, IO_LEDS(gp)
+	call wait
+	li   t0, 5
+	sw   t0, IO_LEDS(gp)
+	call wait           # without this wait, this pattern is overwritten at once
+	li   t0, 10
+	sw   t0, IO_LEDS(gp)
+	call wait
+	li   t0, 0
+	sw   t0, IO_LEDS(gp)
+
+	li   s1, 0          # Y  <- 0
+	li   s3, ymin       # Ci <- ymin
+	li   s11, 80        # rows and columns per frame
+
+loop_y:
+	li   s0, 0          # X  <- 0
+	li   s2, xmin       # Cr <- xmin
+
+loop_x:
+	mv   s4, s2         # Z <- C
+	mv   s5, s3
+
+	li   s10, 9         # iter <- 9
+
+loop_Z:
+	mv   a0, s4         # Zrr <- (Zr*Zr) >> mandel_shift
+	mv   a1, s4
+	call __mulsi3
+	srli s6, a0, mandel_shift
+	mv   a0, s4         # 2Zri <- (Zr*Zi) >> (mandel_shift-1)
+	mv   a1, s5
+	call __mulsi3
+	srai s7, a0, mandel_shift-1   # arithmetic shift: the product may be negative
+	mv   a0, s5         # Zii <- (Zi*Zi) >> mandel_shift
+	mv   a1, s5
+	call __mulsi3
+	srli s8, a0, mandel_shift
+	sub  s4, s6, s8     # Zr <- Zrr - Zii + Cr
+	add  s4, s4, s2
+	add  s5, s7, s3     # Zi <- 2Zri + Ci
+
+	add  s6, s6, s8     # leave the loop when Zrr + Zii > norm_max
+	li   s7, norm_max   # (s7 is free again: Zi has been computed)
+	bgt  s6, s7, exit_Z
+
+	addi s10, s10, -1   # iter--, loop while non-zero
+	bnez s10, loop_Z
+exit_Z:
+	la   a0, colormap   # print colormap[iter]
+	add  a0, a0, s10
+	lbu  a0, 0(a0)
+	call putchar
+
+	addi s0, s0, 1      # next column
+	addi s2, s2, dx
+	bne  s0, s11, loop_x
+
+	li   a0, 13         # end of row: CR, LF
+	call putchar
+	li   a0, 10
+	call putchar
+
+	addi s1, s1, 1      # next row
+	addi s3, s3, dy
+	bne  s1, s11, loop_y
+
+	li   t0, 15         # frame done: all four LEDs on
+	sw   t0, IO_LEDS(gp)
+
+	li   a0, 10         # blank line; a0 is now loaded explicitly instead of
+	call putchar        # relying on putchar having left the last LF in it
+	li   a0, 13
+	call putchar
+	li   a0, 10
+	call putchar
+
+	j    mandelstart
+
+.section .data
+colormap:
+.ascii " .,:;ox%#@"
+
--- a/FIRMWARE/memcpy.c
+++ b/FIRMWARE/memcpy.c
@@ -0,0 +1,27 @@
+#include <stddef.h>
+#include <stdint.h>
+
+#pragma GCC optimize ("no-tree-loop-distribute-patterns")
+
+// Copies len bytes from src to dst and returns dst.
+// When both pointers are 4-byte aligned, the bulk of the data is moved
+// one 32-bit word at a time; the remainder (or everything, when a
+// pointer is unaligned) is moved byte by byte.
+// The regions are not expected to overlap.
+void* memcpy(void * dst, void const * src, size_t len) {
+   uint8_t* pcDst = (uint8_t *) dst;
+   uint8_t const* pcSrc = (uint8_t const *) src;
+
+   // Alignment test done on uintptr_t, the integer type meant to hold
+   // a pointer value (a cast to uint32_t truncates on wider targets).
+   if (!((((uintptr_t)src) | ((uintptr_t)dst)) & 3)) {
+      uint32_t * plDst = (uint32_t *) dst;
+      uint32_t const * plSrc = (uint32_t const *) src;
+      while (len >= 4) {
+         *plDst++ = *plSrc++;
+         len -= 4;
+      }
+      // continue with the bytes that follow the last copied word
+      pcDst = (uint8_t *) plDst;
+      pcSrc = (uint8_t const *) plSrc;
+   }
+
+   while (len--) {
+      *pcDst++ = *pcSrc++;
+   }
+
+   return dst;
+}
--- a/FIRMWARE/notes.txt
+++ b/FIRMWARE/notes.txt
@@ -0,0 +1,26 @@
+# https://blog.thea.codes/the-most-thoroughly-commented-linker-script/
+# https://interrupt.memfault.com/blog/how-to-write-linker-scripts-for-firmware
+
+bin/riscv64-unknown-elf-as -march=rv32i -mabi=ilp32 -mno-relax mandelbrot_terminal.S -o mandelbrot_terminal.o
+riscv64-unknown-elf-ld mandelbrot_terminal.o -T baremetal.ld -m elf32lriscv -nostdlib -norelax
+/home/blevy/Programming/learn-fpga/FemtoRV/FIRMWARE/TOOLS/firmware_words a.elf -ram 6144 -hex a.hex
+
+
+FTDI 2232H
+
+Rx   >
+Tx   <
+RTSn > Request to send 
+CTSn < Clear to send 
+DTRn > Data Terminal Ready
+DSRn < Data Set Ready 
+DCDn > Data Carrier Detect
+
+
+#set_io DCDn 1
+#set_io DSRn 2
+#set_io DTRn 3
+#set_io CTSn 4
+#set_io RTSn 7
+set_io RS232_Tx_TTL 8
+set_io RS232_Rx_TTL 9
--- a/FIRMWARE/perf.S
+++ b/FIRMWARE/perf.S
@@ -0,0 +1,19 @@
+.section .text
+.globl rdcycle
+.globl rdinstret
+
+# rdcycle: returns the 64-bit cycle counter, low word in a0, high word
+# in a1. The high word is sampled before and after the low word; if the
+# two samples differ, the whole read is retried. Clobbers t0.
+rdcycle:
+1:
+   rdcycleh a1          # high word, first sample
+   rdcycle  a0          # low word
+   rdcycleh t0          # high word, second sample
+   bne  a1, t0, 1b      # high word changed in between: read again
+   ret
+
+# rdinstret: returns the 64-bit retired-instruction counter, low word
+# in a0, high word in a1. Same high/low/high read sequence as used for
+# the cycle counter: retry when the two high-word samples differ.
+# Clobbers t0.
+rdinstret:
+1:
+   rdinstreth a1        # high word, first sample
+   rdinstret  a0        # low word
+   rdinstreth t0        # high word, second sample
+   bne  a1, t0, 1b      # high word changed in between: read again
+   ret
--- a/FIRMWARE/perf.h
+++ b/FIRMWARE/perf.h
@@ -0,0 +1,4 @@
+#include <stdint.h>
+
+// 64-bit cycle counter (implemented in perf.S).
+extern uint64_t rdcycle();
+// 64-bit retired-instruction counter (implemented in perf.S).
+extern uint64_t rdinstret();
--- a/FIRMWARE/pi.c
+++ b/FIRMWARE/pi.c
@@ -0,0 +1,186 @@
+    /*
+     * Computation of the n'th decimal digit of \pi with very little memory.
+     * Written by Fabrice Bellard on January 8, 1997.
+     * 
+     * We use a slightly modified version of the method described by Simon
+     * Plouffe in "On the Computation of the n'th decimal digit of various
+     * transcendental numbers" (November 1996). We have modified the algorithm
+     * to get a running time of O(n^2) instead of O(n^3log(n)^3).
+     * 
+     * This program uses mostly integer arithmetic. It may be slow on
+     * hardware where integer multiplications and divisions must be done
+     * by software. We have supposed that 'int' has a size of 32 bits. If
+     * your compiler supports 'long long' integers of 64 bits, you may use
+     * the integer version of 'mul_mod' (see HAS_LONG_LONG).  
+     */
+
+     /* Adapted to FemtoRV32 (Bruno Levy Feb. 2021) */
+
+    #include <stdlib.h>
+    #include <stdio.h>
+    #include <math.h>
+//    #include "errno_fix.h"
+
+
+//#define RV32_FASTCODE __attribute((section(".fastcode")))
+#define RV32_FASTCODE
+
+/* uncomment the following line to use 'long long' integers */
+#define HAS_LONG_LONG 
+
+#ifdef HAS_LONG_LONG
+#define mul_mod(a,b,m) (( (long long) (a) * (long long) (b) ) % (m))
+#else
+#define mul_mod(a,b,m) fmod( (double) a * (double) b, m)
+#endif
+
+/* Return the inverse of x modulo y, as a value in [0, y-1].
+ * Runs the extended Euclidean algorithm on (x, y), keeping only the
+ * coefficient attached to x. */
+int inv_mod(int x, int y) RV32_FASTCODE;
+int inv_mod(int x, int y)
+{
+    int rem = x,  prev_rem  = y;   /* remainder sequence   */
+    int coef = 1, prev_coef = 0;   /* coefficient sequence */
+
+    do {
+        const int quot = prev_rem / rem;
+
+        const int next_coef = prev_coef - quot * coef;
+        prev_coef = coef;
+        coef = next_coef;
+
+        const int next_rem = prev_rem - quot * rem;
+        prev_rem = rem;
+        rem = next_rem;
+    } while (rem != 0);
+
+    const int inverse = prev_coef % y;
+    return (inverse < 0) ? (y + inverse) : inverse;
+}
+
+/* Return (a^b) mod m, by square-and-multiply over the bits of b,
+ * least significant bit first. */
+int pow_mod(int a, int b, int m) RV32_FASTCODE;
+int pow_mod(int a, int b, int m)
+{
+    int result = 1;
+    int square = a;     /* a^(2^i) mod m for the bit being examined */
+
+    for (;;) {
+        if (b & 1) {
+            result = mul_mod(result, square, m);
+        }
+        b = b >> 1;
+        if (b == 0) {
+            return result;
+        }
+        square = mul_mod(square, square, m);
+    }
+}
+
+/* Return true if n is prime.
+ * Only meant for odd candidates: every even n, 2 included, is reported
+ * as non-prime (the caller starts its search at 3). */
+int is_prime(int n) RV32_FASTCODE;
+int is_prime(int n)
+{
+    int i;
+    if ((n % 2) == 0)
+        return 0;
+
+    /* Trial division by odd numbers up to sqrt(n). The bound is written
+     * "i <= n / i" rather than "i*i <= n" so that it cannot overflow
+     * when n is close to INT_MAX. */
+    for (i = 3; i <= n / i; i += 2)
+        if ((n % i) == 0)
+            return 0;
+    return 1;
+}
+
+/* Return the first number greater than n that is_prime() accepts. */
+int next_prime(int n) RV32_FASTCODE;
+int next_prime(int n)
+{
+    for (++n; !is_prime(n); ++n) {
+        /* keep stepping until a prime is found */
+    }
+    return n;
+}
+
+/* Return nine decimal digits as one integer in [0, 999999999]; main()
+ * prints the results for n = 1, 10, 19, ... right after "3.", i.e. n is
+ * the position of the first digit of the group.
+ * For each prime a up to 2N, a partial sum s is accumulated modulo
+ * av = a^vmax using integer arithmetic only; the fractions s/av are then
+ * added modulo 1 in floating point. */
+int digits(int n) RV32_FASTCODE;
+int digits(int n) {
+    int av, a, vmax, N, num, den, k, kq, kq2, t, v, s, i;
+    double sum;
+
+    /* number of series terms used for digit position n */
+    N = (int) ((n + 20) * log(10) / log(2));
+
+    sum = 0;
+
+    /* one pass per prime a in [3, 2N] */
+    for (a = 3; a <= (2 * N); a = next_prime(a)) {
+
+    /* av = a^vmax, with vmax = floor(log(2N)/log(a)) */
+    vmax = (int) (log(2 * N) / log(a));
+    av = 1;
+    for (i = 0; i < vmax; i++)
+        av = av * a;
+
+    s = 0;
+    num = 1;
+    den = 1;
+    v = 0;      /* running exponent of a, updated below */
+    kq = 1;     /* counter reset each time it reaches a */
+    kq2 = 1;    /* counter stepped by 2, reduced by a when it reaches a */
+
+    for (k = 1; k <= N; k++) {
+
+        /* fold k into num, after dividing out its factors of a */
+        t = k;
+        if (kq >= a) {
+        do {
+            t = t / a;
+            v--;
+        } while ((t % a) == 0);
+        kq = 0;
+        }
+        kq++;
+        num = mul_mod(num, t, av);
+
+        /* fold 2k-1 into den, after dividing out its factors of a */
+        t = (2 * k - 1);
+        if (kq2 >= a) {
+        if (kq2 == a) {
+            do {
+            t = t / a;
+            v++;
+            } while ((t % a) == 0);
+        }
+        kq2 -= a;
+        }
+        den = mul_mod(den, t, av);
+        kq2 += 2;
+
+        /* only terms with v > 0 contribute modulo av */
+        if (v > 0) {
+        t = inv_mod(den, av);
+        t = mul_mod(t, num, av);
+        t = mul_mod(t, k, av);
+        for (i = v; i < vmax; i++)
+            t = mul_mod(t, a, av);
+        s += t;
+        if (s >= av)
+            s -= av;
+        }
+
+    }
+
+    /* scale by 10^(n-1) modulo av to reach digit position n */
+    t = pow_mod(10, n - 1, av);
+    s = mul_mod(s, t, av);
+       
+    /* accumulate the fractional part contributed by this prime */
+    sum = fmod(sum + (double) s / (double) av, 1.0);
+    }
+    /* the first nine decimals of the fractional sum */
+    return (int) (sum * 1e9);
+}
+
+
+// Prints "pi = 3." followed by five groups of nine decimals
+// (n = 1, 10, 19, 28, 37).
+int main() {
+    printf("\npi = 3.");
+    for(int n=1; ;n+=9) {
+       int group = digits(n);
+       // digits() returns the nine decimals as one integer, so a group
+       // that starts with zeros would be printed too short by "%d".
+       // Pad by hand: plain "%d" is the only integer format used, which
+       // keeps this independent of width support in printf.
+       for(int scale=100000000; scale>1 && group<scale; scale/=10) {
+           printf("0");
+       }
+       printf("%d",group);
+       if(n > 36) break;
+    }
+    return 0;
+}
--- a/FIRMWARE/pipeline.ld
+++ b/FIRMWARE/pipeline.ld
@@ -0,0 +1,29 @@
+/* Two separate 64kB regions: one for code, one for data. */
+MEMORY {
+   PROGROM (RX) : ORIGIN = 0x00000, LENGTH = 0x10000  /* 64kB ROM */
+   DATARAM (RW) : ORIGIN = 0x10000, LENGTH = 0x10000  /* 64kB RAM */   
+}
+
+SECTIONS {
+
+    /* Code goes to PROGROM; the .text of start_pipeline.o is listed
+       first so that it is placed at the very beginning of the region. */
+    .text : {
+        . = ALIGN(4);
+	start_pipeline.o (.text)
+        *(.text*)
+    } > PROGROM
+
+    /* Everything else is gathered into one output section in DATARAM:
+       initialized data, read-only data, zero-initialized data (bss,
+       COMMON) and the remaining runtime tables. */
+    .data : {
+	. = ALIGN(4);
+        *(.data*)          
+        *(.sdata*)
+        *(.rodata*) 
+        *(.srodata*)
+        *(.bss*)
+        *(.sbss*)
+	
+        *(COMMON)
+        *(.eh_frame)  
+        *(.eh_frame_hdr)
+        *(.init_array*)         
+        *(.gcc_except_table*)  
+    } > DATARAM
+}
--- a/FIRMWARE/print.c
+++ b/FIRMWARE/print.c
@@ -0,0 +1,65 @@
+
+#include <stdarg.h>
+
+// Sends every character of the NUL-terminated string s to putchar.
+void print_string(const char* s) {
+   while(*s) {
+      putchar(*s);
+      ++s;
+   }
+}
+
+// Prints s followed by a newline, character by character through
+// putchar. Always returns 1.
+int puts(const char* s) {
+   for(; *s; ++s) {
+      putchar(*s);
+   }
+   putchar('\n');
+   return 1;
+}
+
+// Prints val in decimal through putchar, with a leading '-' when
+// negative.
+// The magnitude is computed in unsigned arithmetic: negating the most
+// negative int as a signed value overflows, whereas "0u - magnitude"
+// is well defined and yields the right digits for every int.
+void print_dec(int val) {
+   // 3 decimal digits per byte is always enough for an unsigned int
+   char digit[3*sizeof(unsigned int)];
+   int  count = 0;
+   unsigned int magnitude = (unsigned int)val;
+   if(val < 0) {
+      putchar('-');
+      magnitude = 0u - magnitude;
+   }
+   // least significant digit first; "do" so that 0 prints as "0"
+   do {
+      digit[count++] = (char)(magnitude % 10u);
+      magnitude = magnitude / 10u;
+   } while(magnitude);
+   // emit in reverse order: most significant digit first
+   while(count) {
+      putchar('0' + digit[--count]);
+   }
+}
+
+// Prints the nbdigits least significant hexadecimal digits of val
+// (uppercase), most significant first. Digit positions beyond the
+// width of val are printed as '0' instead of shifting by more than
+// the width of the type.
+// Defined before print_hex so that print_hex calls a declared function.
+void print_hex_digits(unsigned int val, int nbdigits) {
+   const int nbits = (int)(8*sizeof(val));
+   for (int i = (4*nbdigits)-4; i >= 0; i -= 4) {
+      unsigned int nibble = (i < nbits) ? ((val >> i) & 15u) : 0u;
+      putchar("0123456789ABCDEF"[nibble]);
+   }
+}
+
+// Prints val as eight hexadecimal digits.
+void print_hex(unsigned int val) {
+   print_hex_digits(val, 8);
+}
+
+// Minimal printf: understands %s, %x (eight hex digits), %d and %c.
+// Any other character following '%' is printed as is (so "%%" prints
+// '%'). No width, precision or length modifiers. Always returns 0.
+int printf(const char *fmt,...)
+{
+    va_list ap;
+    va_start(ap, fmt);
+
+    for(; *fmt; fmt++)
+    {
+        if(*fmt != '%')
+        {
+            putchar(*fmt);
+            continue;
+        }
+
+        fmt++;
+        // A lone '%' at the very end of the format: stop here, otherwise
+        // the loop increment would step past the terminating NUL.
+        if(*fmt == '\0') break;
+
+        switch(*fmt)
+        {
+            case 's': print_string(va_arg(ap,char *));    break;
+            case 'x': print_hex(va_arg(ap,unsigned int)); break;
+            case 'd': print_dec(va_arg(ap,int));          break;
+            case 'c': putchar(va_arg(ap,int));            break;
+            default:  putchar(*fmt);                      break;
+        }
+    }
+
+    va_end(ap);
+
+    return 0;
+}
+
--- a/FIRMWARE/putchar.S
+++ b/FIRMWARE/putchar.S
@@ -0,0 +1,22 @@
+# Base address of memory-mapped IO,
+# Loaded into gp at startup
+.equ IO_BASE, 0x400000  
+
+# IO-reg offsets. To read or write one of them,
+# use IO_XXX(gp)
+.equ IO_LEDS, 4
+.equ IO_UART_DAT, 8
+.equ IO_UART_CNTL, 16
+
+.section .text
+.globl putchar
+
+# putchar: stores a0 to IO_UART_DAT, then spins for as long as bit 9 of
+# IO_UART_CNTL reads as set (presumably the UART "busy" flag - confirm
+# against the UART RTL). Expects gp to hold IO_BASE.
+# Clobbers t0 and t1; a0 is left unchanged.
+putchar:
+   sw   a0, IO_UART_DAT(gp)
+   li   t0, 1<<9              # mask for bit 9 of the control register
+1:
+   lw   t1, IO_UART_CNTL(gp)
+   and  t1, t1, t0
+   bnez t1, 1b                # bit still set: poll again
+   ret
+
--- a/FIRMWARE/raystones.c
+++ b/FIRMWARE/raystones.c
@@ -0,0 +1,518 @@
+/* A port of Dmitry Sokolov's tiny raytracer to C and to FemtoRV32 */
+/* Displays on an ANSI terminal (24-bit color escape sequences)    */
+/* Bruno Levy, 2020                                                */
+/* Original tinyraytracer: https://github.com/ssloy/tinyraytracer  */
+
+#include <stdint.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include "perf.h"
+#include "io.h"
+
+/*******************************************************************/
+
+typedef int BOOL;
+
+static inline float max(float x, float y) { if(x>y) { return x; } return y; }
+static inline float min(float x, float y) { if(x<y) { return x; } return y; }
+
+/*******************************************************************/
+
+// If you want to adapt tinyraytracer to your own platform, there are
+// mostly two macros and two functions to write:
+//   graphics_width
+//   graphics_height
+//   graphics_init()
+//   graphics_set_pixel()
+//
+// You can also write the following functions (or leave them empty if
+// you do not need them):
+//   graphics_terminate()
+//   stats_begin_frame()
+//   stats_begin_pixel()
+//   stats_end_pixel()
+//   stats_end_frame()
+
+
+// Size of the screen
+// Replace with your own variables or values
+
+// Benchmark
+// - graphics deactivated (else UART waiting loop gives
+//   different results according to CPU freq / UART baud rate
+//   ratio).
+// - smaller image size (for faster run in simulation)
+
+static int graphics_width  = 120;
+static int graphics_height = 60;
+
+static int bench_run=0;
+
+// Two pixels per character using UTF8 character set
+// (comment-out if terminal does not support it)
+#define graphics_double_lines
+
+// Replace with your own stuff to initialize graphics.
+// This version emits four ANSI escape sequences in a single printf call
+// (adjacent string literals are concatenated by the compiler).
+static inline void graphics_init() {
+    printf("\033[48;5;16m"   // set background color black
+	   "\033[38;5;15m"   // set foreground color white	   
+	   "\033[H"          // home
+           "\033[2J");       // clear screen
+}
+
+// Replace with your own stuff to terminate graphics or leave empty.
+// This version only restores the colors: black background, white
+// foreground.
+// NOTE(review): an earlier comment said <ctrl><D> is sent to the UART
+// here to exit the simulation in Verilator; this function does not
+// emit that character.
+static inline void graphics_terminate() {
+    printf("\033[48;5;16m"   // set background color black
+	   "\033[38;5;15m"   // set foreground color white
+    );
+
+}
+
+// Replace with your own code.
+// Outputs pixel (x,y) with color (r,g,b); each component is clamped
+// to [0,1] and scaled to 0..255.
+// - bench run: nothing is drawn; only y/2 is printed once per pair of
+//   rows (on the last pixel of each odd row) as a progress indicator.
+// - graphics_double_lines: one character cell shows two vertically
+//   stacked pixels. The pixel of the even row is remembered in static
+//   variables and the cell is emitted when the pixel of the odd row
+//   arrives, so this relies on (x,y) and (x,y+1) being submitted back
+//   to back, as render() does.
+// - otherwise: one cell (a space on a colored background) per pixel.
+void graphics_set_pixel(int x, int y, float r, float g, float b) {
+   r = max(0.0f, min(1.0f, r));
+   g = max(0.0f, min(1.0f, g));
+   b = max(0.0f, min(1.0f, b));
+   uint8_t R = (uint8_t)(255.0f * r);
+   uint8_t G = (uint8_t)(255.0f * g);
+   uint8_t B = (uint8_t)(255.0f * b);
+   // graphics output deactivated for bench run
+   if(bench_run) {
+       if(y & 1) {
+	  if(x == graphics_width-1) {
+	     printf("%d",y/2);
+	  }
+       }
+       return;
+   } 
+#ifdef graphics_double_lines
+   // color of the pixel just above (same x, previous even row)
+   static uint8_t prev_R=0;
+   static uint8_t prev_G=0;
+   static uint8_t prev_B=0;
+   if(y&1) {
+       if((R == prev_R) && (G == prev_G) && (B == prev_B)) {
+	   // both pixels have the same color: a space on that background
+	   printf("\033[48;2;%d;%d;%dm ",(int)R,(int)G,(int)B);
+       } else {
+	   // background = upper pixel, foreground = lower pixel,
+	   // then a partial block character (UTF-8 bytes E2 96 83)
+	   printf("\033[48;2;%d;%d;%dm",(int)prev_R,(int)prev_G,(int)prev_B);
+	   printf("\033[38;2;%d;%d;%dm",(int)R,(int)G,(int)B);
+	   // https://www.w3.org/TR/xml-entity-names/025.html
+	   // https://onlineunicodetools.com/convert-unicode-to-utf8
+	   printf("\xE2\x96\x83");
+       }
+       if(x == graphics_width-1) {
+	   // end of the row pair: colors back to black, next line
+	   printf("\033[38;2;0;0;0m");	   
+	   printf("\033[48;2;0;0;0m\n");
+       }
+   } else {
+       // even row: just remember the color for the next call
+       prev_R = R;
+       prev_G = G;
+       prev_B = B;
+   }
+#else   
+   printf("\033[48;2;%d;%d;%dm ",(int)R,(int)G,(int)B);
+   if(x == graphics_width-1) {
+       printf("\033[48;2;0;0;0m\n");
+   }
+#endif   
+}
+
+
+// Begins statistics collection for current pixel.
+// Leave empty if not needed.
+// There are these two levels (pixel and frame) because on some
+// femtorv32 cores (quark, tachyon), the clock tick counter does not
+// have sufficient bits and will wrap during the time taken by
+// rendering a frame (up to several minutes).
+// The return type is spelled out: a missing one (implicit int) is not
+// valid C since C99.
+static inline void stats_begin_pixel() {
+}
+
+// Ends statistics collection for current pixel.
+// Leave empty if not needed.
+// The return type is spelled out: a missing one (implicit int) is not
+// valid C since C99.
+static inline void stats_end_pixel() {
+}
+
+// Prints kx/1000 as a decimal number with exactly three fractional
+// digits (kx is a value scaled by 1000).
+static void printk(uint64_t kx) {
+    const int whole = (int)(kx / 1000);
+    const int milli = (int)(kx % 1000);
+    printf("%d.",whole);
+    // left-pad the fractional part with zeros up to three digits
+    if(milli<100) printf("0");
+    if(milli<10)  printf("0");
+    printf("%d",milli);
+}
+
+static uint64_t instret_start;
+static uint64_t cycles_start;
+
+// Begins statistics collection for current frame: records the current
+// retired-instruction and cycle counters.
+// Leave empty if not needed.
+// The return type is spelled out: a missing one (implicit int) is not
+// valid C since C99.
+static inline void stats_begin_frame() {
+    instret_start = rdinstret();
+    cycles_start  = rdcycle();
+}
+
+// Ends statistics collection for current frame and displays the result:
+// image size, whether graphics output was active, CPI and "raystones"
+// (pixels * 10^9 / cycles), both shown with three decimals.
+// Leave empty if not needed.
+static inline void stats_end_frame() {
+   graphics_terminate();
+   uint64_t instret = rdinstret() - instret_start;
+   uint64_t cycles = rdcycle()    - cycles_start ;
+   // Both values are scaled by 1000 for printk(). A counter difference
+   // of zero is reported as 0 rather than dividing by zero.
+   uint64_t kCPI       = instret ? cycles*1000/instret : 0;
+   uint64_t pixels     = graphics_width * graphics_height;
+   uint64_t kRAYSTONES = cycles ? (pixels*1000000000)/cycles : 0;
+   printf(
+       "\n%dx%d      %s     ",
+       graphics_width,graphics_height,
+       bench_run ?
+           "no gfx output (measurement is accurate)" :
+           "gfx output (measurement is NOT accurate)"
+   );
+   printf("CPI="); printk(kCPI); printf("     ");
+   printf("RAYSTONES="); printk(kRAYSTONES);
+   printf("\n");
+}
+
+// Normally you will not need to modify anything beyond that point.
+/*******************************************************************/
+
+// 3- and 4-component float vectors, passed and returned by value.
+typedef struct { float x,y,z; }   vec3;
+typedef struct { float x,y,z,w; } vec4;
+
+// Builds a vec3 from its components.
+static inline vec3 make_vec3(float x, float y, float z) {
+  vec3 result = { x, y, z };
+  return result;
+}
+
+// Builds a vec4 from its components.
+static inline vec4 make_vec4(float x, float y, float z, float w) {
+  vec4 result = { x, y, z, w };
+  return result;
+}
+
+// -V
+static inline vec3 vec3_neg(vec3 V) {
+  vec3 result = { -V.x, -V.y, -V.z };
+  return result;
+}
+
+// U + V
+static inline vec3 vec3_add(vec3 U, vec3 V) {
+  vec3 result = { U.x+V.x, U.y+V.y, U.z+V.z };
+  return result;
+}
+
+// U - V
+static inline vec3 vec3_sub(vec3 U, vec3 V) {
+  vec3 result = { U.x-V.x, U.y-V.y, U.z-V.z };
+  return result;
+}
+
+// Dot product of U and V.
+static inline float vec3_dot(vec3 U, vec3 V) {
+  return U.x*V.x+U.y*V.y+U.z*V.z;
+}
+
+// s * U
+static inline vec3 vec3_scale(float s, vec3 U) {
+  vec3 result = { s*U.x, s*U.y, s*U.z };
+  return result;
+}
+
+// Euclidean length of U.
+static inline float vec3_length(vec3 U) {
+  return sqrtf(U.x*U.x+U.y*U.y+U.z*U.z);
+}
+
+// U scaled by the inverse of its length (no check for a zero length).
+static inline vec3 vec3_normalize(vec3 U) {
+  const float inv_length = 1.0f/vec3_length(U);
+  return vec3_scale(inv_length,U);
+}
+
+/*************************************************************************/
+
+// A point light: where it is and how strong it is.
+typedef struct Light {
+    vec3 position;
+    float intensity;
+} Light;
+
+// Builds a Light from its two fields.
+Light make_Light(vec3 position, float intensity) {
+  Light result = { position, intensity };
+  return result;
+}
+
+/*************************************************************************/
+
+// Surface description. cast_ray() uses the four albedo components as
+// the weights of, in order: diffuse, specular, reflected and refracted
+// light.
+typedef struct {
+    float refractive_index;
+    vec4  albedo;
+    vec3  diffuse_color;
+    float specular_exponent;
+} Material;
+
+// Builds a Material: refractive index r, albedo a, diffuse color and
+// specular exponent spec.
+Material make_Material(float r, vec4 a, vec3 color, float spec) {
+  Material result = { r, a, color, spec };
+  return result;
+}
+
+// The default material: refractive index 1, albedo (1,0,0,0),
+// black diffuse color, specular exponent 0.
+Material make_Material_default() {
+  Material result = { 1, make_vec4(1,0,0,0), make_vec3(0,0,0), 0 };
+  return result;
+}
+
+/*************************************************************************/
+
+// A sphere of the scene, together with its material.
+typedef struct {
+  vec3 center;
+  float radius;
+  Material material;
+} Sphere;
+
+// Builds a Sphere from center c, radius r and material M.
+Sphere make_Sphere(vec3 c, float r, Material M) {
+  Sphere result = { c, r, M };
+  return result;
+}
+
+// Intersects the ray (orig, dir) with sphere S.
+// Returns 0 when the ray misses the sphere or when both intersections
+// lie behind the origin. Otherwise returns 1 and stores in *t0 the
+// parameter of the first intersection that is not behind the origin.
+// *t0 may also be written when 0 is returned.
+BOOL Sphere_ray_intersect(Sphere* S, vec3 orig, vec3 dir, float* t0) {
+  const vec3  to_center = vec3_sub(S->center, orig);
+  const float tca = vec3_dot(to_center,dir);              // projection on the ray
+  const float d2  = vec3_dot(to_center,to_center) - tca*tca; // squared ray-center distance
+  const float r2  = S->radius*S->radius;
+  if (d2 > r2) {
+    return 0;
+  }
+  const float thc = sqrtf(r2 - d2);   // half length of the chord
+  *t0 = tca - thc;                    // near intersection
+  if (*t0 < 0) {
+    *t0 = tca + thc;                  // near one is behind: try the far one
+  }
+  return !(*t0 < 0);
+}
+
+// Mirror image of direction I about normal N: I - 2*(I.N)*N.
+vec3 reflect(vec3 I, vec3 N) {
+  const float twice_proj = 2.f*vec3_dot(I,N);
+  return vec3_sub(I, vec3_scale(twice_proj,N));
+}
+
+// Refracts direction I through a surface of normal N (Snell's law).
+// eta_t is the refractive index on the far side, eta_i the index on
+// the near side (callers pass 1 for eta_i).
+vec3 refract(vec3 I, vec3 N, float eta_t, float eta_i /* =1.f */) {
+  float cosi = -max(-1.f, min(1.f, vec3_dot(I,N)));
+  if (cosi<0) {
+    // the ray comes from inside the object:
+    // flip the normal and swap the two media
+    return refract(I, vec3_neg(N), eta_i, eta_t);
+  }
+  const float eta = eta_i / eta_t;
+  const float k = 1 - eta*eta*(1 - cosi*cosi);
+  if (k<0) {
+    // total reflection, no ray to refract: an arbitrary direction is
+    // returned, it has no physical meaning
+    return make_vec3(1,0,0);
+  }
+  return vec3_add(vec3_scale(eta,I),vec3_scale((eta*cosi - sqrtf(k)),N));
+}
+
+// Finds what the ray (orig, dir) hits first: one of the spheres or the
+// checkerboard.
+// On a hit, *hit receives the intersection point and *N the normal.
+// For a sphere, *material is replaced by the sphere's material; for the
+// checkerboard only material->diffuse_color is overwritten, the other
+// fields keep the values they had on entry.
+// Returns non-zero when the nearest hit is closer than 1000.
+BOOL scene_intersect(
+   vec3 orig, vec3 dir, Sphere* spheres, int nb_spheres,
+   vec3* hit, vec3* N, Material* material
+) {
+  // nearest sphere along the ray
+  float spheres_dist = 1e30;
+  for(int i=0; i<nb_spheres; ++i) {
+    float dist_i;
+    if(
+       Sphere_ray_intersect(&spheres[i], orig, dir, &dist_i) &&
+       (dist_i < spheres_dist)
+    ) {
+      spheres_dist = dist_i;
+      *hit = vec3_add(orig,vec3_scale(dist_i,dir));
+      *N = vec3_normalize(vec3_sub(*hit, spheres[i].center));
+      *material = spheres[i].material;
+    }
+  }
+  // checkerboard: skipped when the ray is (nearly) parallel to the plane
+  float checkerboard_dist = 1e30;
+  if (fabs(dir.y)>1e-3)  {
+    float d = -(orig.y+4)/dir.y; // the checkerboard plane has equation y = -4
+    vec3 pt = vec3_add(orig, vec3_scale(d,dir));
+    // in front of the origin, inside the board, and nearer than any sphere
+    if (d>0 && fabs(pt.x)<10 && pt.z<-10 && pt.z>-30 && d<spheres_dist) {
+      checkerboard_dist = d;
+      *hit = pt;
+      *N = make_vec3(0,1,0);
+      // alternate between the two tile colors
+      material->diffuse_color =
+	(((int)(.5*hit->x+1000) + (int)(.5*hit->z)) & 1)
+	             ? make_vec3(.3, .3, .3)
+	             : make_vec3(.3, .2, .1);
+    }
+  }
+  return min(spheres_dist, checkerboard_dist)<1000;
+}
+
+// Returns the color seen along the ray (orig, dir).
+// When nothing is hit, or when depth exceeds 2, the background color is
+// returned. Otherwise the color is the sum of four terms weighted by
+// the components of the material's albedo: diffuse lighting (x),
+// specular highlight (y), reflected ray (z) and refracted ray (w); the
+// last two are obtained by recursive calls with depth + 1.
+vec3 cast_ray(
+   vec3 orig, vec3 dir, Sphere* spheres, int nb_spheres,
+   Light* lights, int nb_lights, int depth /* =0 */
+) {
+  vec3 point,N;
+  Material material = make_Material_default();
+  if (
+    depth>2 ||
+    !scene_intersect(orig, dir, spheres, nb_spheres, &point, &N, &material)
+  ) {
+    // background: depends on the vertical component of the direction only
+    float s = 0.5*(dir.y + 1.0);
+    return vec3_add(
+	vec3_scale(s,make_vec3(0.2, 0.7, 0.8)),
+        vec3_scale(s,make_vec3(0.0, 0.0, 0.5))
+    );
+  }
+
+  vec3 reflect_dir=vec3_normalize(reflect(dir, N));
+  vec3 refract_dir=vec3_normalize(refract(dir,N,material.refractive_index,1));
+  
+  // offset the original point to avoid occlusion by the object itself 
+  vec3 reflect_orig =
+    vec3_dot(reflect_dir,N) < 0
+               ? vec3_sub(point,vec3_scale(1e-3,N))
+               : vec3_add(point,vec3_scale(1e-3,N)); 
+  vec3 refract_orig =
+    vec3_dot(refract_dir,N) < 0
+               ? vec3_sub(point,vec3_scale(1e-3,N))
+               : vec3_add(point,vec3_scale(1e-3,N));
+  // both secondary rays are always traced, whatever the albedo weights
+  vec3 reflect_color = cast_ray(
+       reflect_orig, reflect_dir, spheres, nb_spheres,
+       lights, nb_lights, depth + 1
+  );
+  vec3 refract_color = cast_ray(
+       refract_orig, refract_dir, spheres, nb_spheres,
+       lights, nb_lights, depth + 1
+  );
+  
+  // accumulate the contribution of every light that is not occluded
+  float diffuse_light_intensity = 0, specular_light_intensity = 0;
+  for (int i=0; i<nb_lights; i++) {
+    vec3  light_dir = vec3_normalize(vec3_sub(lights[i].position,point));
+    float light_distance = vec3_length(vec3_sub(lights[i].position,point));
+
+    vec3 shadow_orig =
+      vec3_dot(light_dir,N) < 0
+                ? vec3_sub(point,vec3_scale(1e-3,N))
+                : vec3_add(point,vec3_scale(1e-3,N)) ;
+    // checking if the point lies in the shadow of the lights[i]
+    vec3 shadow_pt, shadow_N;
+    Material tmpmaterial;
+    if (
+       scene_intersect(
+	 shadow_orig, light_dir, spheres, nb_spheres,
+	 &shadow_pt, &shadow_N, &tmpmaterial
+       ) && (
+  	 vec3_length(vec3_sub(shadow_pt,shadow_orig)) < light_distance
+	     )
+    ) continue ;
+    
+    diffuse_light_intensity  +=
+                  lights[i].intensity * max(0.f, vec3_dot(light_dir,N));
+     
+    // specular term: powf() is only called when both the base and the
+    // exponent are strictly positive
+    float abc = max(
+	           0.f, vec3_dot(vec3_neg(reflect(vec3_neg(light_dir), N)),dir)
+	        );
+    float def = material.specular_exponent;
+    if(abc > 0.0f && def > 0.0f) {
+      specular_light_intensity += powf(abc,def)*lights[i].intensity;
+    }
+  }
+  // weighted sum of the four terms
+  vec3 result = vec3_scale(
+      diffuse_light_intensity * material.albedo.x, material.diffuse_color
+  );
+  result = vec3_add(
+       result, vec3_scale(specular_light_intensity * material.albedo.y,
+       make_vec3(1,1,1))
+  );
+  result = vec3_add(result, vec3_scale(material.albedo.z, reflect_color));
+  result = vec3_add(result, vec3_scale(material.albedo.w, refract_color));
+  return result;
+}
+
+// Computes and outputs pixel (i,j): builds the direction of the ray
+// through the center of the pixel for a camera at the origin looking
+// down -z with a field of view of pi/3, casts it, and hands the
+// resulting color to graphics_set_pixel().
+static inline void render_pixel(
+    int i, int j, Sphere* spheres, int nb_spheres, Light* lights, int nb_lights
+) {
+   const float fov  = M_PI/3.;
+   stats_begin_pixel();
+   float dir_x =  (i + 0.5) - graphics_width/2.;
+   float dir_y = -(j + 0.5) + graphics_height/2.; // this flips the image.
+   float dir_z = -graphics_height/(2.*tan(fov/2.));
+   vec3 C = cast_ray(
+       make_vec3(0,0,0), vec3_normalize(make_vec3(dir_x, dir_y, dir_z)),
+       spheres, nb_spheres, lights, nb_lights, 0
+   );
+   graphics_set_pixel(i,j,C.x,C.y,C.z);
+   stats_end_pixel();
+}
+
+// Renders one frame of graphics_width x graphics_height pixels,
+// bracketed by stats_begin_frame() / stats_end_frame().
+void render(Sphere* spheres, int nb_spheres, Light* lights, int nb_lights) {
+   stats_begin_frame();
+#ifdef graphics_double_lines
+   // Rows are produced in pairs, column by column: the pixel of the
+   // upper row is immediately followed by the one just below it.
+   for (int top = 0; top<graphics_height; top+=2) {
+      const int bottom = top+1;
+      for (int col = 0; col<graphics_width; col++) {
+         render_pixel(col,top   ,spheres,nb_spheres,lights,nb_lights);
+         render_pixel(col,bottom,spheres,nb_spheres,lights,nb_lights);
+      }
+   }
+#else
+   // Plain scan: row by row, left to right.
+   int row = 0;
+   while (row<graphics_height) {
+      int col = 0;
+      while (col<graphics_width) {
+         render_pixel(col,row,spheres,nb_spheres,lights,nb_lights);
+         col++;
+      }
+      row++;
+   }
+#endif
+   stats_end_frame();
+}
+
+int nb_spheres = 4;
+Sphere spheres[4];
+
+int nb_lights = 3;
+Light lights[3];
+
+// Fills the global spheres[] and lights[] arrays: four materials,
+// four spheres (one per material) and three lights.
+void init_scene() {
+    // make_Material(refractive index, albedo, diffuse color, specular exponent)
+    Material ivory = make_Material(
+       1.0, make_vec4(0.6,  0.3, 0.1, 0.0), make_vec3(0.4, 0.4, 0.3),   50.
+    );
+    Material glass = make_Material(
+       1.5, make_vec4(0.0,  0.5, 0.1, 0.8), make_vec3(0.6, 0.7, 0.8),  125.
+    );
+    Material red_rubber = make_Material(
+       1.0, make_vec4(0.9,  0.1, 0.0, 0.0), make_vec3(0.3, 0.1, 0.1),   10.
+    );
+    Material mirror = make_Material(
+       1.0, make_vec4(0.0, 10.0, 0.8, 0.0), make_vec3(1.0, 1.0, 1.0),  142.
+    );
+
+    // make_Sphere(center, radius, material)
+    spheres[0] = make_Sphere(make_vec3(-3,    0,   -16), 2,      ivory);
+    spheres[1] = make_Sphere(make_vec3(-1.0, -1.5, -12), 2,      glass);
+    spheres[2] = make_Sphere(make_vec3( 1.5, -0.5, -18), 3, red_rubber);
+    spheres[3] = make_Sphere(make_vec3( 7,    5,   -18), 4,     mirror);
+
+    // make_Light(position, intensity)
+    lights[0] = make_Light(make_vec3(-20, 20,  20), 1.5);
+    lights[1] = make_Light(make_vec3( 30, 50, -25), 1.8);
+    lights[2] = make_Light(make_vec3( 30, 20,  30), 1.7);
+}
+
+// Renders the scene twice: first a small 40x20 benchmark frame with
+// graphics output disabled (used for the accurate measurement), then
+// the full 120x60 frame with graphics output. The LEDs show the
+// progress (5, then 10, then 15).
+int main() {
+    init_scene();
+
+    graphics_init();
+    IO_OUT(IO_LEDS,5);
+    // pass 1: benchmark run, no pixel output
+    bench_run = 1;
+    graphics_width  = 40;
+    graphics_height = 20;
+    printf("Running without graphic output (for accurate measurement)...\n");
+    render(spheres, nb_spheres, lights, nb_lights);
+    IO_OUT(IO_LEDS,10);
+
+    // pass 2: full-size frame, drawn on the terminal
+    bench_run = 0;
+    graphics_width = 120;
+    graphics_height = 60;
+    render(spheres, nb_spheres, lights, nb_lights);
+    IO_OUT(IO_LEDS,15);
+    // note: stats_end_frame() has already called graphics_terminate()
+    graphics_terminate();
+    
+    return 0;
+}
--- a/FIRMWARE/read_spiflash.c
+++ b/FIRMWARE/read_spiflash.c
@@ -0,0 +1,14 @@
+#include "io.h"
+
+#define SPI_FLASH_BASE ((char*)(1 << 23))
+
+/*
+ * Dumps the first 16 little-endian 16-bit words read at SPI_FLASH_BASE,
+ * each as four hexadecimal digits, while showing the word index on the LEDs.
+ */
+int main()  {
+   for(int i=0; i<16; ++i) {
+      IO_OUT(IO_LEDS,i);
+      // Go through unsigned char: where plain char is signed, a byte >= 0x80
+      // would be sign-extended and corrupt the combined 16-bit value.
+      int lo = (int)(unsigned char)SPI_FLASH_BASE[2*i  ];
+      int hi = (int)(unsigned char)SPI_FLASH_BASE[2*i+1];
+      print_hex_digits((hi << 8) | lo,4); // print four hexadecimal digits
+      printf(" ");
+   }
+   printf("\n");
+   return 0;
+}
--- a/FIRMWARE/riscv_logo.c
+++ b/FIRMWARE/riscv_logo.c
@@ -0,0 +1,135 @@
+/* 
+ * FEMTORV32 - FEMTOSOC demo program:
+ * Displaying a rotating RISCV logo
+ */ 
+
+#include <stdio.h>
+
+#ifdef __linux__
+#include <unistd.h>
+#endif
+
+
+/* The RISC-V logo, as a tiny 16x16 bitmap
+ * (remember, I only have 4 kB of RAM
+ * on the IceStick!).
+ * Each entry is a color index (0..7) into cmap[]. main() reads it as
+ * logo[row][column], with both indices wrapped modulo 16.
+ */
+unsigned char logo[16][16] = {
+   {7,7,7,7,7,7,5,3,3,3,3,3,3,3,3,7},
+   {7,7,7,7,7,7,7,5,3,3,3,3,3,3,3,7},
+   {1,1,1,1,2,7,7,7,3,3,3,3,3,3,3,7},
+   {0,0,0,0,0,1,7,7,5,3,3,3,3,3,3,7},
+   {0,0,0,0,0,0,7,7,6,3,3,3,3,3,6,7},
+   {0,0,0,0,0,0,7,7,5,3,3,3,3,4,7,7},
+   {0,0,0,0,0,2,7,7,4,3,3,3,3,7,7,7},
+   {0,2,2,2,7,7,7,6,3,3,3,3,6,7,7,7},
+   {0,7,7,7,7,7,6,3,3,3,3,5,7,7,2,7},
+   {0,1,7,7,7,4,3,3,3,3,3,7,7,7,0,7},
+   {0,0,2,7,7,6,3,3,3,3,6,7,7,1,0,7},
+   {0,0,0,2,7,7,5,3,3,5,7,7,2,0,0,7},
+   {0,0,0,0,7,7,7,5,4,7,7,2,0,0,0,7},
+   {0,0,0,0,0,7,7,7,7,7,7,0,0,0,0,7},
+   {0,0,0,0,0,1,7,7,7,7,1,0,0,0,0,7},
+   {7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7}
+};
+
+
+/* 
+ * ANSI color codes:
+ * https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences
+ */
+
+
+/* Escape sequence selecting a 24-bit RGB background color, followed by two
+ * spaces (one "pixel"). R, G and B are stringized exactly as written. */
+#define ANSIRGB(R,G,B) "\033[48;2;" #R ";"  #G ";" #B "m  "
+
+/* Same, with a single SGR color code C (for instance 44 or 104). */
+#define ANSICOL(C) "\033[" #C "m  "
+
+
+/* 
+ * The colormap.
+ */
+
+/*
+ * This one corresponds to the official RISC-V logo,
+ * but uses more bandwidth (full RGB ANSI codes)
+ */
+/*
+const char* cmap[8] = {
+   ANSIRGB(040,051,116),
+   ANSIRGB(123,128,155),
+   ANSIRGB(170,172,188),
+   ANSIRGB(249,177,021),
+   ANSIRGB(249,190,101),
+   ANSIRGB(249,199,130),
+   ANSIRGB(252,216,176),
+   ANSIRGB(250,251,248)
+};
+*/
+
+/* More compact colormap: one SGR background color code per logo color
+ * index. Indices 4, 5 and 6 all map to the same code (103). */
+const char* cmap[8] = {
+   ANSICOL(44),
+   ANSICOL(104),
+   ANSICOL(47),
+   ANSICOL(102),
+   ANSICOL(103),
+   ANSICOL(103),
+   ANSICOL(103),
+   ANSICOL(107)
+};
+
+/*
+ * Generated by TOOLS/make_sintab.c
+ * One full sine period in 64 steps, scaled by 256:
+ * sintab[i] ~ 256*sin(2*pi*i/64). main() indexes it with (i+16)&63 to get
+ * the quarter-period-shifted value (the cosine).
+ */
+
+int sintab[64] = {
+   0,25,49,74,97,120,142,162,181,197,212,225,236,244,251,254,
+   256,254,251,244,236,225,212,197,181,162,142,120,97,74,49,25,
+   0,-25,-49,-74,-97,-120,-142,-162,-181,-197,-212,-225,-236,-244,
+   -251,-254,-256,-254,-251,-244,-236,-225,-212,-197,-181,-162,
+   -142,-120,-97,-74,-49,-25
+};
+
+
+/* Size of the displayed image, in "pixels" (main() prints two characters
+ * per pixel and one text line per pixel row). */
+#define GL_width  40
+#define GL_height 40
+
+
+/*
+ * Endlessly redraws the logo, rotated and zoomed according to the frame
+ * number, as GL_width x GL_height colored blocks on an ANSI terminal.
+ * Texture coordinates are fixed point: the texel index is taken from
+ * bit 18 upwards and wrapped modulo 16.
+ */
+void main() {
+
+    int frame = 0;
+    int last_col = -1;   // color index of the previous pixel, -1 = none
+    for(;;) {
+        printf("\033[H"); // reset cursor position
+
+        // The zoom factor oscillates together with the rotation angle.
+        int scaling = (sintab[frame&63]+400) << 1;
+        // (Ux,Uy): texture step per screen column.
+        // (Vx,Vy): texture step per screen row (U turned by a quarter turn).
+        int Ux = scaling*sintab[frame & 63];
+        int Uy = scaling*sintab[(frame + 16) & 63];
+        int Vx = -Uy;
+        int Vy =  Ux;
+
+        // Texture coordinates of the top-left pixel.
+        int X0 = -(GL_width/2)*(Ux+Vx);
+        int Y0 = -(GL_height/2)*(Uy+Vy);
+
+        for(int y=0; y<GL_height; ++y) {
+            int X = X0;
+            int Y = Y0;
+            for(int x=0; x<GL_width; ++x) {
+                unsigned char col = logo[(Y >> 18)&15][(X >> 18)&15];
+                // The escape code is only re-sent when the color changes.
+                // "%s" keeps the colormap text out of the format string.
+                printf("%s", col == last_col ? "  " : cmap[col]);
+                last_col = col;
+                X += Ux;
+                Y += Uy;
+            }
+            printf("\033[49m\n"); // reset background to default and newline
+            last_col = -1;
+            X0 += Vx;
+            Y0 += Vy;
+        }
+        ++frame;
+#ifdef __linux__
+        usleep(20000);
+#endif
+//       if(frame > 20) break;
+    }
+}
--- a/FIRMWARE/sieve.c
+++ b/FIRMWARE/sieve.c
@@ -0,0 +1,106 @@
+// Taken from picorv32
+// 
+// This is free and unencumbered software released into the public domain.
+//
+// Anyone is free to copy, modify, publish, use, compile, sell, or
+// distribute this software, either in source code form or as a compiled
+// binary, for any purpose, commercial or non-commercial, and by any
+// means.
+
+// A simple Sieve of Eratosthenes
+
+#include <stdio.h>
+#include <stdint.h>
+
+/*************************************************************************/
+
+// Note: if this is changed, then the checksum needs
+// to be updated as well.
+#define BITMAP_SIZE 64
+
+typedef int bool;
+
+static uint32_t bitmap[BITMAP_SIZE/32];
+
+static uint32_t hash;
+
+// One step of the XOR flavour of DJB2: multiplies the running hash a by 33
+// (shift-and-add, modulo 2^32), then mixes in b with an exclusive or.
+static uint32_t mkhash(uint32_t a, uint32_t b)
+{
+	uint32_t times33 = (a << 5) + a;
+	return times33 ^ b;
+}
+
+// Sets bit idx of the sieve bitmap. idx must be in [0, BITMAP_SIZE).
+static void bitmap_set(int idx)
+{
+   // Shift an unsigned one: 1 << 31 on a signed int is undefined behavior.
+   bitmap[idx/32] |= (uint32_t)1 << (idx % 32);
+}
+
+// Returns 1 if bit idx of the sieve bitmap is set, 0 otherwise.
+// idx must be in [0, BITMAP_SIZE).
+static bool bitmap_get(int idx)
+{
+   // Shift an unsigned one: 1 << 31 on a signed int is undefined behavior.
+   return (bitmap[idx/32] & ((uint32_t)1 << (idx % 32))) != 0;
+}
+
+// Prints "<idx><ordinal suffix> prime: <val>" on one line (idx is padded
+// with a space when below 10), then folds idx and val into the checksum.
+static void print_prime(int idx, int val)
+{
+	if (idx < 10)
+		printf(" ");
+	printf("%d",idx);
+
+	// 10..19 always take "th"; otherwise the last digit decides.
+	int last_digit = (idx / 10 == 1) ? 0 : idx % 10;
+	if (last_digit == 1)
+		printf("st");
+	else if (last_digit == 2)
+		printf("nd");
+	else if (last_digit == 3)
+		printf("rd");
+	else
+		printf("th");
+	printf(" prime: %d\n",val);
+
+	hash = mkhash(hash, idx);
+	hash = mkhash(hash, val);
+}
+
+// Prints 2, then every odd number 3+2*i (i in [0,BITMAP_SIZE)) that was not
+// crossed out, and finally a checksum of everything printed, compared with
+// the reference value for the current BITMAP_SIZE.
+void sieve(void)
+{
+
+	int idx = 1;
+	hash = 5381;
+	print_prime(idx++, 2);
+	for (int i = 0; i < BITMAP_SIZE; i++) {
+		if (bitmap_get(i))
+			continue;
+		print_prime(idx++, 3+2*i);
+		// Cross out the multiples of the prime just found. Bit k stands
+		// for the odd number 3+2*k, so even multiples are skipped.
+		for (int j = 2*(3+2*i);; j += 3+2*i) {
+			if (j%2 == 0)
+				continue;
+			int k = (j-3)/2;
+			if (k >= BITMAP_SIZE)
+				break;
+			bitmap_set(k);
+		}
+	}
+
+	// uint32_t need not be unsigned int: convert explicitly for %x.
+	printf("checksum:\n   %x",(unsigned int)hash);
+
+	if (hash == 0x1772A48F) {
+		printf(" OK\n");
+	} else {
+		printf(" ERROR\n");
+	}
+}
+
+// Runs the sieve forever, calling wait() ten times between two runs.
+// NOTE(review): wait() is not declared in this file; confirm where its
+// prototype and definition come from.
+int main(void)
+{
+
+        for(;;) {
+	   sieve();
+	   for(int i=0; i<10; ++i) {
+	      wait();
+	   }
+	}
+   
+   
+        return 0; // never reached: the loop above does not terminate
+}
+
--- a/FIRMWARE/spiflash0.ld
+++ b/FIRMWARE/spiflash0.ld
@@ -0,0 +1,10 @@
+/* Everything (code and data) is placed in a single FLASH region. */
+MEMORY {
+   FLASH (RX)  : ORIGIN = 0x00820000, LENGTH = 0x100000 /* 0x100000 bytes = 1 MB in flash */
+}
+SECTIONS {
+    everything : {
+	. = ALIGN(4);
+	start.o (.text)   /* startup code first, at the start of the region */
+        *(.*)             /* then every input section whose name starts with a dot */
+    } >FLASH
+}
--- a/FIRMWARE/spiflash1.ld
+++ b/FIRMWARE/spiflash1.ld
@@ -0,0 +1,60 @@
+/* Linker script for programs stored in SPI flash */
+/* Inspired from picorv32/picosoc/sections.lds       */
+/*                                                */
+/* text and rodata sections are sent to flash     */
+/* bss sections are sent to BRAM                  */
+/* data sections are sent to BRAM and have        */
+/*  initialization data in flash.                 */
+/* AT keyword specifies LMA (Load Memory Address) */
+
+MEMORY {
+    FLASH (rx)  : ORIGIN = 0x00820000, LENGTH = 0x100000    /* 0x100000 bytes = 1 MB in flash */
+    RAM   (rwx) : ORIGIN = 0x00000000, LENGTH = 0x1800      /* 6 kB in RAM   */ 
+}
+
+SECTIONS {
+
+    /*
+     * Initialized data section.
+     * The program executes knowing that the data is in the RAM
+     * but the loader puts the initial values in the FLASH, at _sidata.
+     * It is one task of the startup code (start_spiflash1.S) to copy the
+     * initial values from FLASH to RAM.
+     */
+    .data : AT ( _sidata ) {
+        . = ALIGN(4);
+        _sdata = .;        /* data start in RAM; used by startup code in order to initialise the .data section */
+        _ram_start = .;    /* global symbol at ram start (e.g., for garbage collector) */
+
+        /* Initialized data */
+        *(.data*)
+        *(.sdata*)
+
+        . = ALIGN(4);
+        _edata = .;        /* data end in RAM; used by startup code in order to initialise the .data section */
+    } > RAM
+
+    /* The program code and read-only data go into FLASH */
+    .text : {
+        . = ALIGN(4);
+        start_spiflash1.o(.text)  /* c runtime initialization (code) */
+        *(.text*)                 /* .text* sections (code) */
+        . = ALIGN(4);
+        *(.rodata*)               /* .rodata* sections (constants, strings, etc.) */
+        *(.srodata*)              /* .srodata* sections (small constants) */
+        . = ALIGN(4);             /* the startup code copies .data word by word: keep _sidata word-aligned */
+        _etext = .;               /* define a global symbol at end of code */
+        _sidata = _etext;         /* load address of .data; used by the startup in order to initialize it */
+    } >FLASH
+
+    /* Uninitialized data section (zeroed by the startup code) */
+    .bss : {
+        . = ALIGN(4);
+        _sbss = .;         /* define a global symbol at bss start; used by startup code */
+        *(.bss*)
+        *(.sbss*)
+        *(COMMON)
+        . = ALIGN(4);
+        _ebss = .;         /* define a global symbol at bss end; used by startup code */
+    } >RAM
+}
--- a/FIRMWARE/spiflash2.ld
+++ b/FIRMWARE/spiflash2.ld
@@ -0,0 +1,80 @@
+/* Linker script for programs stored in SPI flash */
+/* Inspired from picorv32/picosoc/sections.lds       */
+/*                                                */
+/* text and rodata sections are sent to flash     */
+/* bss sections are sent to BRAM                  */
+/* data sections are sent to BRAM and have        */
+/*  initialization data in flash.                 */
+/* AT keyword specifies LMA (Load Memory Address) */
+
+MEMORY {
+    FLASH (rx)  : ORIGIN = 0x00820000, LENGTH = 0x100000    /* 0x100000 bytes = 1 MB in flash */
+    RAM   (rwx) : ORIGIN = 0x00000000, LENGTH = 0x1800      /* 6 kB in RAM   */ 
+}
+
+SECTIONS {
+
+
+    /*
+     * This is the initialized data and fastcode section.
+     * The program executes knowing that its content is in the RAM
+     * but the loader puts the initial values in the FLASH, at _sidata.
+     * It is one task of the startup code (start_spiflash1.S) to copy the
+     * initial values from FLASH to RAM.
+     */
+    .data_and_fastcode : AT ( _sidata ) {
+        . = ALIGN(4);
+        _sdata = .;        /* data start in RAM; used by startup code in order to initialise the .data section */
+        _ram_start = .;    /* global symbol at ram start (e.g., for garbage collector) */
+
+        /* Initialized data */
+        *(.data*)
+        *(.sdata*)
+
+        /* integer mul and div */
+        */libgcc.a:muldi3.o(.text)
+        */libgcc.a:div.o(.text)
+
+        /* character output and formatting routines, also run from RAM */
+        putchar.o(.text)
+        print.o(.text)
+
+        /* functions with attribute((section(".fastcode"))) */
+        *(.fastcode*)
+
+        . = ALIGN(4);
+        _edata = .;        /* data end in RAM; used by startup code in order to initialise the .data section */
+    } > RAM
+
+    /* The (non fastcode) program code and other data goes into FLASH */
+    .text : {
+        . = ALIGN(4);
+        start_spiflash1.o(.text)  /* c runtime initialization (code) */
+
+        /*
+         * I do not understand why, but if I do not put this section, I got
+         * an overlapping sections error with some programs (for instance pi.c
+         * or C++ programs)
+         */
+        *(.eh_frame)
+        *(.eh_frame_hdr)
+        *(.init_array)
+        *(.gcc_except_table*)
+
+        *(.text*)                 /* .text* sections (code) */
+        . = ALIGN(4);
+        *(.rodata*)               /* .rodata* sections (constants, strings, etc.) */
+        *(.srodata*)              /* .srodata* sections (small constants) */
+        . = ALIGN(4);             /* the startup code copies .data word by word: keep _sidata word-aligned */
+        _etext = .;               /* define a global symbol at end of code */
+        _sidata = _etext;         /* load address of the RAM image; used by the startup in order to initialize it */
+    } >FLASH
+
+    /* Uninitialized data section (zeroed by the startup code) */
+    .bss : {
+        . = ALIGN(4);
+        _sbss = .;         /* define a global symbol at bss start; used by startup code */
+        *(.bss*)
+        *(.sbss*)
+        *(COMMON)
+        . = ALIGN(4);
+        _ebss = .;         /* define a global symbol at bss end; used by startup code */
+    } >RAM
+}
--- a/FIRMWARE/spiflash3.ld
+++ b/FIRMWARE/spiflash3.ld
@@ -0,0 +1,87 @@
+/* Linker script for programs stored in SPI flash */
+/* Inspired from picorv32/picosoc/sections.lds       */
+/*                                                */
+/* text and rodata sections are sent to flash     */
+/* bss sections are sent to BRAM                  */
+/* data sections are sent to BRAM and have        */
+/*  initialization data in flash.                 */
+/* AT keyword specifies LMA (Load Memory Address) */
+
+MEMORY {
+    FLASH (rx)  : ORIGIN = 0x00820000, LENGTH = 0x100000    /* 0x100000 bytes = 1 MB in flash */
+    RAM   (rwx) : ORIGIN = 0x00000000, LENGTH = 0x1800      /* 6 kB in RAM   */ 
+}
+
+SECTIONS {
+
+
+    /*
+     * This is the initialized data and fastcode section.
+     * The program executes knowing that its content is in the RAM
+     * but the loader puts the initial values in the FLASH, at _sidata.
+     * It is one task of the startup code (start_spiflash1.S) to copy the
+     * initial values from FLASH to RAM.
+     */
+    .data_and_fastcode : AT ( _sidata ) {
+        . = ALIGN(4);
+        _sdata = .;        /* data start in RAM; used by startup code in order to initialise the .data section */
+        _ram_start = .;    /* global symbol at ram start (e.g., for garbage collector) */
+
+        /* Initialized data */
+        *(.data*)
+        *(.sdata*)
+
+        /* integer mul and div */
+        */libgcc.a:muldi3.o(.text)
+        */libgcc.a:div.o(.text)
+
+        /* putchar.o(.text) */
+
+        /* functions with attribute((section(".fastcode"))) */
+        *(.fastcode*)
+
+        . = ALIGN(4);
+        _edata = .;        /* data end in RAM; used by startup code in order to initialise the .data section */
+    } > RAM
+
+    /* The (non fastcode) program code and other data goes into FLASH */
+    .text : {
+        . = ALIGN(4);
+        start_spiflash1.o(.text)  /* c runtime initialization (code) */
+
+        /*
+         * I do not understand why, but if I do not put this section, I got
+         * an overlapping sections error with some programs (for instance pi.c
+         * or C++ programs)
+         */
+        *(.eh_frame)
+        *(.eh_frame_hdr)
+        *(.init_array*)
+        *(.gcc_except_table*)
+
+        *(.text*)                 /* .text* sections (code) */
+        . = ALIGN(4);
+        *(.rodata*)               /* .rodata* sections (constants, strings, etc.) */
+        *(.srodata*)              /* .srodata* sections (small constants) */
+        . = ALIGN(4);             /* the startup code copies .data word by word: keep _sidata word-aligned */
+        _etext = .;               /* define a global symbol at end of code */
+        _sidata = _etext;         /* load address of the RAM image; used by the startup in order to initialize it */
+    } >FLASH
+
+    /* Uninitialized data section (zeroed by the startup code) */
+    .bss : {
+        . = ALIGN(4);
+        _sbss = .;         /* define a global symbol at bss start; used by startup code */
+        *(.bss*)
+        *(.sbss*)
+        *(COMMON)
+        . = ALIGN(4);
+        _ebss = .;         /* define a global symbol at bss end; used by startup code */
+    } >RAM
+
+    /* this only marks the start of the heap, right after .bss; no minimum size is reserved here */
+    .heap : {
+        . = ALIGN(4);
+        _heap_start = .;    /* define a global symbol at heap start */
+        _end = .;           /* as expected by syscalls.c            */
+    } >RAM
+
+}
--- a/FIRMWARE/start.S
+++ b/FIRMWARE/start.S
@@ -0,0 +1,9 @@
+# Minimal startup code: sets gp and sp, calls main(), and executes ebreak
+# if main() ever returns.
+.equ IO_BASE,   0x400000        # value loaded into gp
+.equ STACK_TOP, 0x1800          # initial stack pointer
+
+.section .text
+.globl start
+start:
+        li      gp, IO_BASE
+        li      sp, STACK_TOP
+        call    main
+        ebreak                  # only reached when main() returns
+	
--- a/FIRMWARE/start_pipeline.S
+++ b/FIRMWARE/start_pipeline.S
@@ -0,0 +1,9 @@
+# Minimal startup code: sets gp and sp, calls main(), and executes ebreak
+# if main() ever returns.
+.equ IO_BASE,   0x400000        # value loaded into gp
+.equ STACK_TOP, 0x20000         # initial stack pointer
+
+.section .text
+.globl start
+start:
+        li      gp, IO_BASE
+        li      sp, STACK_TOP
+        call    main
+        ebreak                  # only reached when main() returns
+	
--- a/FIRMWARE/start_spiflash1.S
+++ b/FIRMWARE/start_spiflash1.S
@@ -0,0 +1,43 @@
+.equ IO_BASE, 0x400000
+
+.text
+.global _start
+.type _start, @function
+
+# Startup code for programs that keep .data and .bss in RAM:
+#   1. loads gp with IO_BASE and sp with 0x1800,
+#   2. zeroes the .bss section,
+#   3. copies the initial values of .data from flash to RAM,
+#   4. calls main(), then executes ebreak if main() returns.
+# Both loops move one 32-bit word per iteration, so all the bounds are
+# expected to be word-aligned. Clobbers a0-a3 before calling main().
+_start:
+.option push
+.option norelax
+     li  gp,IO_BASE
+.option pop
+
+     li   sp,0x1800
+
+# zero-init bss section:
+# clears from _sbss to _ebss
+# _sbss and _ebss are defined by the linker script (spiflash*.ld)
+# Addresses are compared as unsigned values (bgeu/bltu).
+     la a0, _sbss
+     la a1, _ebss
+     bgeu a0, a1, end_init_bss
+loop_init_bss:
+     sw zero, 0(a0)
+     addi a0, a0, 4
+     bltu a0, a1, loop_init_bss
+end_init_bss:
+
+# copy data section from SPI Flash to BRAM:
+# copies from _sidata (in flash) to _sdata ... _edata (in BRAM)
+# _sidata, _sdata and _edata are defined by the linker script (spiflash*.ld)
+     la a0, _sidata
+     la a1, _sdata
+     la a2, _edata
+     bgeu a1, a2, end_init_data
+loop_init_data:
+     lw a3, 0(a0)
+     sw a3, 0(a1)
+     addi a0, a0, 4
+     addi a1, a1, 4
+     bltu a1, a2, loop_init_data
+end_init_data:
+
+     call main
+     ebreak
--- a/FIRMWARE/test_rdcycle.c
+++ b/FIRMWARE/test_rdcycle.c
@@ -0,0 +1,13 @@
+#include "perf.h"
+
+/*
+ * Prints the values returned by rdcycle() and rdinstret() 100 times, then
+ * a final reading together with 100*cycles/instret.
+ * Both values are 64-bit wide but are printed truncated to int.
+ */
+int main() {
+   for(int i=0; i<100; ++i) {
+      uint64_t cycles = rdcycle();
+      uint64_t instret = rdinstret();      
+      printf("i=%d    cycles=%d     instret=%d\n", i, (int)cycles, (int)instret);
+   }
+   uint64_t instret = rdinstret();      
+   uint64_t cycles = rdcycle();
+   // NOTE(review): divides by instret; assumes rdinstret() is non-zero here.
+   printf("cycles=%d     instret=%d    100CPI=%d\n", (int)cycles, (int)instret, (int)(100*cycles/instret));
+   
+}
--- a/FIRMWARE/test_spi_flash.c
+++ b/FIRMWARE/test_spi_flash.c
@@ -0,0 +1,17 @@
+#include <stdio.h>
+#include <stdint.h>
+
+#define SPI_FLASH_BASE ((uint32_t*)(1 << 23))
+
+/*
+ * Endlessly dumps the first 40 32-bit words read at SPI_FLASH_BASE: index,
+ * hexadecimal value, and the four bytes of the word printed as characters
+ * (in memory order).
+ */
+int main() {
+   for(;;) {
+      for(int i=0; i<40; ++i) {
+         uint32_t word = SPI_FLASH_BASE[i];
+         char* c = (char*)&word;
+         // uint32_t need not be unsigned int: convert explicitly for %x.
+         printf("%d 0x%x %c%c%c%c\n", i, (unsigned int)word, c[0],c[1],c[2],c[3]);
+      }
+      printf("\n");
+      printf("\n");
+   }
+
+}
--- a/FIRMWARE/tinyraytracer.c
+++ b/FIRMWARE/tinyraytracer.c
@@ -0,0 +1,444 @@
+/* A port of Dmitry Sokolov's tiny raytracer to C and to FemtoRV32 */
+/* Displays on the small OLED display and/or HDMI                  */
+/* Bruno Levy, 2020                                                */
+/* Original tinyraytracer: https://github.com/ssloy/tinyraytracer  */
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/*******************************************************************/
+
+typedef int BOOL;
+
+// Returns x when x > y, and y in every other case.
+static inline float max(float x, float y) {
+  if (x > y) return x;
+  return y;
+}
+// Returns x when x < y, and y in every other case.
+static inline float min(float x, float y) {
+  if (x < y) return x;
+  return y;
+}
+
+/*******************************************************************/
+
+// If you want to adapt tinyraytracer to your own platform, there are
+// mostly two macros and two functions to write:
+//   graphics_width
+//   graphics_height
+//   graphics_init()
+//   graphics_set_pixel()
+//
+// You can also write the following functions (or leave them empty if
+// you do not need them):
+//   graphics_terminate()
+//   stats_begin_frame()
+//   stats_begin_pixel()
+//   stats_end_pixel()
+//   stats_end_frame()
+
+
+// Size of the screen
+// Replace with your own variables or values
+#define graphics_width  120
+#define graphics_height 60
+
+// Two pixels per character using UTF8 character set
+// (comment-out if terminal does not support it)
+#define graphics_double_lines
+
+// Replace with your own stuff to initialize graphics.
+// Terminal version: black background, cursor to the top-left corner,
+// then clear the screen.
+static inline void graphics_init() {
+    printf("\033[48;5;16m");  // set background color black
+    printf("\033[H");         // home
+    printf("\033[2J");        // clear screen
+}
+
+// Replace with your own stuff to terminate graphics or leave empty.
+// This implementation does nothing.
+// NOTE(review): this comment used to describe sending <ctrl><D> to the UART
+// to exit the simulation in Verilator (captured by special code in
+// RTL/DEVICES/uart.v), but no such code is present in the body.
+static inline void graphics_terminate() {
+}
+
+
+// Replace with your own code.
+// Draws pixel (x,y) with color (r,g,b); each component is clamped to [0,1]
+// and converted to 0..255. With graphics_double_lines, one character cell
+// shows two vertically stacked pixels: the call for an even y only stores
+// the color, and the following call for an odd y prints the cell.
+void graphics_set_pixel(int x, int y, float r, float g, float b) {
+   const uint8_t R = (uint8_t)(255.0f * max(0.0f, min(1.0f, r)));
+   const uint8_t G = (uint8_t)(255.0f * max(0.0f, min(1.0f, g)));
+   const uint8_t B = (uint8_t)(255.0f * max(0.0f, min(1.0f, b)));
+#ifdef graphics_double_lines
+   // Color stored by the last call made with an even y.
+   static uint8_t prev_R=0;
+   static uint8_t prev_G=0;
+   static uint8_t prev_B=0;
+   if(!(y&1)) {
+       // Even line: nothing to print yet.
+       prev_R = R;
+       prev_G = G;
+       prev_B = B;
+       return;
+   }
+   if((R != prev_R) || (G != prev_G) || (B != prev_B)) {
+       // Two colors in the cell: stored color as background, current color
+       // as foreground of a lower block character.
+       printf("\033[48;2;%d;%d;%dm",(int)prev_R,(int)prev_G,(int)prev_B);
+       printf("\033[38;2;%d;%d;%dm",(int)R,(int)G,(int)B);
+       // https://www.w3.org/TR/xml-entity-names/025.html
+       // https://onlineunicodetools.com/convert-unicode-to-utf8
+       printf("\xE2\x96\x83");
+   } else {
+       // Same color twice: a space on that background is enough.
+       printf("\033[48;2;%d;%d;%dm ",(int)R,(int)G,(int)B);
+   }
+   if(x == graphics_width-1) {
+       // End of line: both colors back to black, then newline.
+       printf("\033[38;2;0;0;0m");
+       printf("\033[48;2;0;0;0m\n");
+   }
+#else
+   printf("\033[48;2;%d;%d;%dm ",(int)R,(int)G,(int)B);
+   if(x == graphics_width-1) {
+       printf("\033[48;2;0;0;0m\n");
+   }
+#endif
+}
+
+
+// Begins statistics collection for current frame.
+// Leave empty if not needed.
+// (Explicit void return type: implicit int is not valid since C99.)
+static inline void stats_begin_frame() {
+}
+
+// Begins statistics collection for current pixel.
+// Leave empty if not needed.
+// There are these two levels because on some
+// femtorv32 cores (quark, tachyon), the clock tick counter does not
+// have sufficient bits and will wrap during the time taken by
+// rendering a frame (up to several minutes).
+// (Explicit void return type: implicit int is not valid since C99.)
+static inline void stats_begin_pixel() {
+}
+
+// Ends statistics collection for current pixel.
+// Leave empty if not needed.
+// (Explicit void return type: implicit int is not valid since C99.)
+static inline void stats_end_pixel() {
+}
+
+// Ends statistics collection for current frame
+// and displays result.
+// Leave empty if not needed.
+// (Explicit void return type: implicit int is not valid since C99.)
+static inline void stats_end_frame() {
+}
+
+// Normally you will not need to modify anything beyond that point.
+/*******************************************************************/
+
+// 3-component float vector; used below for positions, directions and colors.
+typedef struct { float x,y,z; }   vec3;
+// 4-component float vector; used below for the four albedo weights of a material.
+typedef struct { float x,y,z,w; } vec4;
+
+// Builds a vec3 from its three components.
+static inline vec3 make_vec3(float x, float y, float z) {
+  vec3 result = { x, y, z };
+  return result;
+}
+
+// Builds a vec4 from its four components.
+static inline vec4 make_vec4(float x, float y, float z, float w) {
+  vec4 result = { x, y, z, w };
+  return result;
+}
+
+// Returns -V.
+static inline vec3 vec3_neg(vec3 V) {
+  vec3 opposite;
+  opposite.x = -V.x;
+  opposite.y = -V.y;
+  opposite.z = -V.z;
+  return opposite;
+}
+
+// Returns the component-wise sum U+V.
+static inline vec3 vec3_add(vec3 U, vec3 V) {
+  vec3 sum;
+  sum.x = U.x+V.x;
+  sum.y = U.y+V.y;
+  sum.z = U.z+V.z;
+  return sum;
+}
+
+// Returns the component-wise difference U-V.
+static inline vec3 vec3_sub(vec3 U, vec3 V) {
+  vec3 diff;
+  diff.x = U.x-V.x;
+  diff.y = U.y-V.y;
+  diff.z = U.z-V.z;
+  return diff;
+}
+
+// Returns the dot product of U and V (terms summed in x, y, z order).
+static inline float vec3_dot(vec3 U, vec3 V) {
+  const float xx = U.x*V.x;
+  const float yy = U.y*V.y;
+  const float zz = U.z*V.z;
+  return xx+yy+zz;
+}
+
+// Returns U with every component multiplied by s.
+static inline vec3 vec3_scale(float s, vec3 U) {
+  vec3 scaled;
+  scaled.x = s*U.x;
+  scaled.y = s*U.y;
+  scaled.z = s*U.z;
+  return scaled;
+}
+
+// Returns the Euclidean length of U.
+static inline float vec3_length(vec3 U) {
+  const float squared_length = vec3_dot(U,U);
+  return sqrtf(squared_length);
+}
+
+// Returns U scaled by the inverse of its length (no check for zero length).
+static inline vec3 vec3_normalize(vec3 U) {
+  const float inv_length = 1.0f/vec3_length(U);
+  return vec3_scale(inv_length,U);
+}
+
+/*************************************************************************/
+
+// A point light source.
+typedef struct Light {
+    vec3 position;
+    float intensity;   // scales the diffuse and specular contributions
+} Light;
+
+// Builds a Light from its position and intensity.
+Light make_Light(vec3 position, float intensity) {
+  Light light;
+  light.intensity = intensity;
+  light.position  = position;
+  return light;
+}
+
+/*************************************************************************/
+
+// Surface properties of an object.
+typedef struct {
+    float refractive_index;
+    vec4  albedo;            // weights: x diffuse, y specular, z reflection, w refraction
+    vec3  diffuse_color;
+    float specular_exponent;
+} Material;
+
+// Builds a Material from refractive index r, albedo weights a, diffuse
+// color and specular exponent spec.
+Material make_Material(float r, vec4 a, vec3 color, float spec) {
+  Material mat;
+  mat.specular_exponent = spec;
+  mat.diffuse_color     = color;
+  mat.albedo            = a;
+  mat.refractive_index  = r;
+  return mat;
+}
+
+// Default material: refractive index 1, diffuse weight only, black diffuse
+// color, specular exponent 0.
+Material make_Material_default() {
+  return make_Material(1, make_vec4(1,0,0,0), make_vec3(0,0,0), 0);
+}
+
+/*************************************************************************/
+
+// A sphere and the material it is made of.
+typedef struct {
+  vec3 center;
+  float radius;
+  Material material;
+} Sphere;
+
+// Builds a Sphere from center c, radius r and material M.
+Sphere make_Sphere(vec3 c, float r, Material M) {
+  Sphere sphere;
+  sphere.material = M;
+  sphere.radius   = r;
+  sphere.center   = c;
+  return sphere;
+}
+
+// Tests the ray (orig, dir) against sphere S.
+// Returns 1 and stores in *t0 the distance along dir to the nearest
+// intersection that is not behind orig; returns 0 when there is none.
+// *t0 is left untouched when the ray misses the sphere altogether.
+BOOL Sphere_ray_intersect(Sphere* S, vec3 orig, vec3 dir, float* t0) {
+  const vec3  to_center = vec3_sub(S->center, orig);
+  const float tca = vec3_dot(to_center,dir);        // projection of the center on the ray
+  const float d2  = vec3_dot(to_center,to_center) - tca*tca;
+  const float r2  = S->radius*S->radius;
+  if (d2 > r2) {
+    return 0;
+  }
+  const float thc    = sqrtf(r2 - d2);
+  const float t_near = tca - thc;
+  const float t_far  = tca + thc;
+  // Fall back to the far intersection when the near one is behind orig.
+  *t0 = (t_near < 0) ? t_far : t_near;
+  return !(*t0 < 0);
+}
+
+// Returns I - 2*(I.N)*N, the mirror image of direction I about normal N.
+vec3 reflect(vec3 I, vec3 N) {
+  const float twice_proj = 2.f*vec3_dot(I,N);
+  return vec3_sub(I, vec3_scale(twice_proj,N));
+}
+
+// Refracts direction I at a surface of normal N using Snell's law.
+// eta_t and eta_i are the refractive indices on either side; they are
+// swapped (and N flipped) when the ray comes from inside the object.
+vec3 refract(vec3 I, vec3 N, float eta_t, float eta_i /* =1.f */) {
+  float cosi = -max(-1.f, min(1.f, vec3_dot(I,N)));
+  if (cosi<0) {
+    // if the ray comes from the inside the object, swap the air and the media
+    return refract(I, vec3_neg(N), eta_i, eta_t);
+  }
+  float eta = eta_i / eta_t;
+  float k = 1 - eta*eta*(1 - cosi*cosi);
+  if (k<0) {
+    // k<0 = total reflection, no ray to refract.
+    // I refract it anyways, this has no physical meaning
+    return make_vec3(1,0,0);
+  }
+  return vec3_add(vec3_scale(eta,I),vec3_scale((eta*cosi - sqrtf(k)),N));
+}
+
+// Finds the nearest intersection of the ray (orig, dir) with the spheres
+// and with the checkerboard (a bounded part of the plane y = -4).
+// On a hit, stores the hit point in *hit and the surface normal in *N.
+// A sphere hit copies the whole sphere material into *material; a
+// checkerboard hit only overwrites material->diffuse_color.
+// Returns non-zero when something closer than 1000 was hit.
+BOOL scene_intersect(
+   vec3 orig, vec3 dir, Sphere* spheres, int nb_spheres,
+   vec3* hit, vec3* N, Material* material
+) {
+  float spheres_dist = 1e30;
+  for(int i=0; i<nb_spheres; ++i) {
+    float dist_i;
+    if(
+       Sphere_ray_intersect(&spheres[i], orig, dir, &dist_i) &&
+       (dist_i < spheres_dist)
+    ) {
+      spheres_dist = dist_i;
+      *hit = vec3_add(orig,vec3_scale(dist_i,dir));
+      *N = vec3_normalize(vec3_sub(*hit, spheres[i].center));
+      *material = spheres[i].material;
+    }
+  }
+  float checkerboard_dist = 1e30;
+  // Rays that are nearly parallel to the plane are ignored.
+  if (fabs(dir.y)>1e-3)  {
+    float d = -(orig.y+4)/dir.y; // the checkerboard plane has equation y = -4
+    vec3 pt = vec3_add(orig, vec3_scale(d,dir));
+    // The board covers |x| < 10 and -30 < z < -10, and only counts when it
+    // is in front of the ray origin and closer than the nearest sphere.
+    if (d>0 && fabs(pt.x)<10 && pt.z<-10 && pt.z>-30 && d<spheres_dist) {
+      checkerboard_dist = d;
+      *hit = pt;
+      *N = make_vec3(0,1,0);
+      // Alternate the two colors on a grid of 2x2 squares.
+      material->diffuse_color =
+	(((int)(.5*hit->x+1000) + (int)(.5*hit->z)) & 1)
+	             ? make_vec3(.3, .3, .3)
+	             : make_vec3(.3, .2, .1);
+    }
+  }
+  return min(spheres_dist, checkerboard_dist)<1000;
+}
+
+// Returns the color seen along the ray (orig, dir).
+// When the ray hits the scene, the result combines diffuse and specular
+// lighting from every light that is not shadowed with the colors of the
+// reflected and refracted rays, weighted by the material albedo.
+// depth is the recursion level (0 for primary rays); rays deeper than 2,
+// or rays that hit nothing, return the background color.
+vec3 cast_ray(
+   vec3 orig, vec3 dir, Sphere* spheres, int nb_spheres,
+   Light* lights, int nb_lights, int depth /* =0 */
+) {
+  vec3 point,N;
+  Material material = make_Material_default();
+  if (
+    depth>2 ||
+    !scene_intersect(orig, dir, spheres, nb_spheres, &point, &N, &material)
+  ) {
+    // Background color, depending on the vertical direction of the ray.
+    // NOTE(review): both colors are weighted by s (there is no 1-s term);
+    // confirm that this is intended.
+    float s = 0.5*(dir.y + 1.0);
+    return vec3_add(
+        vec3_scale(s,make_vec3(0.2, 0.7, 0.8)),
+        vec3_scale(s,make_vec3(0.0, 0.0, 0.5))
+    );
+  }
+
+  vec3 reflect_dir=vec3_normalize(reflect(dir, N));
+  vec3 refract_dir=vec3_normalize(refract(dir,N,material.refractive_index,1));
+
+  // offset the original point to avoid occlusion by the object itself
+  vec3 reflect_orig =
+    vec3_dot(reflect_dir,N) < 0
+               ? vec3_sub(point,vec3_scale(1e-3,N))
+               : vec3_add(point,vec3_scale(1e-3,N));
+  vec3 refract_orig =
+    vec3_dot(refract_dir,N) < 0
+               ? vec3_sub(point,vec3_scale(1e-3,N))
+               : vec3_add(point,vec3_scale(1e-3,N));
+  vec3 reflect_color = cast_ray(
+       reflect_orig, reflect_dir, spheres, nb_spheres,
+       lights, nb_lights, depth + 1
+  );
+  vec3 refract_color = cast_ray(
+       refract_orig, refract_dir, spheres, nb_spheres,
+       lights, nb_lights, depth + 1
+  );
+
+  float diffuse_light_intensity = 0, specular_light_intensity = 0;
+  for (int i=0; i<nb_lights; i++) {
+    // The vector to the light and its length are computed once; the unit
+    // direction is derived from them exactly as vec3_normalize() would.
+    vec3  to_light = vec3_sub(lights[i].position,point);
+    float light_distance = vec3_length(to_light);
+    vec3  light_dir = vec3_scale(1.0f/light_distance,to_light);
+
+    vec3 shadow_orig =
+      vec3_dot(light_dir,N) < 0
+                ? vec3_sub(point,vec3_scale(1e-3,N))
+                : vec3_add(point,vec3_scale(1e-3,N)) ;
+    // checking if the point lies in the shadow of the lights[i]
+    vec3 shadow_pt, shadow_N;
+    Material tmpmaterial;
+    if (
+       scene_intersect(
+         shadow_orig, light_dir, spheres, nb_spheres,
+         &shadow_pt, &shadow_N, &tmpmaterial
+       ) && (
+         vec3_length(vec3_sub(shadow_pt,shadow_orig)) < light_distance
+             )
+    ) continue ;
+
+    diffuse_light_intensity  +=
+                  lights[i].intensity * max(0.f, vec3_dot(light_dir,N));
+
+    // Specular term; powf() is only called when both arguments are positive.
+    float abc = max(
+                   0.f, vec3_dot(vec3_neg(reflect(vec3_neg(light_dir), N)),dir)
+                );
+    float def = material.specular_exponent;
+    if(abc > 0.0f && def > 0.0f) {
+      specular_light_intensity += powf(abc,def)*lights[i].intensity;
+    }
+  }
+  // Weighted sum of the four contributions:
+  // albedo x = diffuse, y = specular, z = reflection, w = refraction.
+  vec3 result = vec3_scale(
+      diffuse_light_intensity * material.albedo.x, material.diffuse_color
+  );
+  result = vec3_add(
+       result, vec3_scale(specular_light_intensity * material.albedo.y,
+       make_vec3(1,1,1))
+  );
+  result = vec3_add(result, vec3_scale(material.albedo.z, reflect_color));
+  result = vec3_add(result, vec3_scale(material.albedo.w, refract_color));
+  return result;
+}
+
+// Casts the primary ray through the center of pixel (i,j) from the origin
+// and draws the resulting color, bracketed by the per-pixel stats hooks.
+static inline void render_pixel(
+    int i, int j, Sphere* spheres, int nb_spheres, Light* lights, int nb_lights
+) {
+   const float fov  = M_PI/3.;
+   stats_begin_pixel();
+   float ray_x =  (i + 0.5) - graphics_width/2.;
+   float ray_y = -(j + 0.5) + graphics_height/2.; // this flips the image.
+   float ray_z = -graphics_height/(2.*tan(fov/2.));
+   vec3 eye     = make_vec3(0,0,0);
+   vec3 ray_dir = vec3_normalize(make_vec3(ray_x, ray_y, ray_z));
+   vec3 color   = cast_ray(
+       eye, ray_dir, spheres, nb_spheres, lights, nb_lights, 0
+   );
+   graphics_set_pixel(i,j,color.x,color.y,color.z);
+   stats_end_pixel();
+}
+
+// Renders one frame of the scene, pixel by pixel.
+// With graphics_double_lines, the two pixels that share a character cell
+// (lines j and j+1) are rendered one right after the other.
+void render(Sphere* spheres, int nb_spheres, Light* lights, int nb_lights) {
+   stats_begin_frame();
+   graphics_init();
+#ifdef graphics_double_lines
+   for (int j = 0; j<graphics_height; j+=2) {
+      for (int i = 0; i<graphics_width; i++) {
+          for (int dj = 0; dj<2; dj++) {
+              render_pixel(i,j+dj,spheres,nb_spheres,lights,nb_lights);
+          }
+      }
+   }
+#else
+   int j = 0;
+   while (j<graphics_height) {
+      int i = 0;
+      while (i<graphics_width) {
+          render_pixel(i,j,spheres,nb_spheres,lights,nb_lights);
+          i++;
+      }
+      j++;
+   }
+#endif
+   graphics_terminate();
+   stats_end_frame();
+}
+
+// Scene storage. Each count must match the size of its array;
+// init_scene() fills every slot.
+int nb_spheres = 4;
+Sphere spheres[4];
+
+int nb_lights = 3;
+Light lights[3];
+
+// Fills the global scene: three lights, then four spheres, each sphere
+// built right after the material it uses. Material arguments are:
+// refractive index, albedo weights, diffuse color, specular exponent.
+void init_scene() {
+    lights[0] = make_Light(make_vec3(-20, 20,  20), 1.5);
+    lights[1] = make_Light(make_vec3( 30, 50, -25), 1.8);
+    lights[2] = make_Light(make_vec3( 30, 20,  30), 1.7);
+
+    Material mat_ivory = make_Material(
+       1.0, make_vec4(0.6,  0.3, 0.1, 0.0), make_vec3(0.4, 0.4, 0.3),   50.
+    );
+    spheres[0] = make_Sphere(make_vec3(-3,    0,   -16), 2, mat_ivory);
+
+    Material mat_glass = make_Material(
+       1.5, make_vec4(0.0,  0.5, 0.1, 0.8), make_vec3(0.6, 0.7, 0.8),  125.
+    );
+    spheres[1] = make_Sphere(make_vec3(-1.0, -1.5, -12), 2, mat_glass);
+
+    Material mat_rubber = make_Material(
+       1.0, make_vec4(0.9,  0.1, 0.0, 0.0), make_vec3(0.3, 0.1, 0.1),   10.
+    );
+    spheres[2] = make_Sphere(make_vec3( 1.5, -0.5, -18), 3, mat_rubber);
+
+    Material mat_mirror = make_Material(
+       1.0, make_vec4(0.0, 10.0, 0.8, 0.0), make_vec3(1.0, 1.0, 1.0),  142.
+    );
+    spheres[3] = make_Sphere(make_vec3( 7,    5,   -18), 4, mat_mirror);
+}
+
+// Entry point: builds the scene, then renders a single frame.
+int main() {
+    init_scene();
+    render(spheres, nb_spheres, lights, nb_lights);
+    return 0;
+}
--- a/FIRMWARE/tty_graphics.h
+++ b/FIRMWARE/tty_graphics.h
@@ -0,0 +1,173 @@
+#ifndef TTY_GRAPHICS_H
+#define TTY_GRAPHICS_H
+
+#include <stdio.h>
+#include <stdint.h>
+
+/**
+ * \brief Restores the default tty colors: white text on a black background.
+ * \details Call this once drawing is finished; otherwise subsequent text may
+ *    be hard or impossible to read with whatever foreground and background
+ *    colors were active last.
+ */
+static inline void tty_graphics_reset_colors() {
+    // First sequence (48;5;16): background black.
+    // Second sequence (38;5;15): foreground white.
+    printf("\033[48;5;16m\033[38;5;15m");
+}
+
+/**
+ * \brief Moves the cursor to the origin (top left) by emitting "ESC [ H".
+ */
+static inline void tty_graphics_home() {
+    printf("\033[H");
+}
+
+/**
+ * \brief Clears the terminal by emitting "ESC [ 2 J" (no other sequence is sent).
+ */
+static inline void tty_graphics_clear() {
+    printf("\033[2J");
+}
+
+/**
+ * \brief Initializes "graphics mode".
+ * \details In order: resets the default colors, moves the cursor to the
+ *  top-left position, then clears the terminal.
+ */
+static inline void tty_graphics_init() {
+    tty_graphics_reset_colors();
+    tty_graphics_home();
+    tty_graphics_clear();
+}
+
+/**
+ * \brief Terminates "graphics mode".
+ * \details Only restores the default foreground and background colors.
+ */
+static inline void tty_graphics_terminate() {
+    tty_graphics_reset_colors();
+}
+
+/**
+ * \brief Moves the cursor to column x, row y by emitting ESC[<y>;<x>H (row first).
+ */
+static inline void tty_graphics_gotoXY(int x, int y) {
+    printf("\033[%d;%dH",y,x); 
+}
+
+/**
+ * \brief Draws a "pixel" (a space on an r,g,b background) at the current
+ *  cursor position; printing the space advances the cursor.
+ * \param[in] r , g , b color components, sent as "ESC [ 48;2;r;g;b m".
+ */
+static inline void tty_graphics_draw_one_pixel(
+    uint8_t r, uint8_t g, uint8_t b
+) {
+    printf("\033[48;2;%d;%d;%dm ",(int)r,(int)g,(int)b);    
+}
+
+/**
+ * \brief Draws two vertically stacked "pixels" in one character cell at the
+ *  current cursor position and advances the cursor.
+ * \param[in] r1 , g1 , b1 color of the upper pixel (set as background)
+ * \param[in] r2 , g2 , b2 color of the lower pixel (set as foreground)
+ * \details Characters are roughly twice as high as wide. To get square
+ *  pixels, the cell is filled with the background color and U+2584 LOWER
+ *  HALF BLOCK is drawn over it in the foreground color.
+ */
+static inline void tty_graphics_draw_two_pixels(
+    uint8_t r1, uint8_t g1, uint8_t b1,
+    uint8_t r2, uint8_t g2, uint8_t b2
+) {
+    if((r2 == r1) && (g2 == g1) && (b2 == b1)) {
+        // Same color twice: a plain space on that background is enough.
+        tty_graphics_draw_one_pixel(r1,g1,b1);
+    } else {
+        printf("\033[48;2;%d;%d;%dm",(int)r1,(int)g1,(int)b1);
+        printf("\033[38;2;%d;%d;%dm",(int)r2,(int)g2,(int)b2);
+        // UTF-8 encoding of U+2584 (lower half block). The previous bytes,
+        // E2 96 83, encode U+2583 (lower three eighths), giving unequal halves.
+        printf("\xE2\x96\x84");
+    }
+}
+
+/**
+ * \brief Ends the current row: sets both colors to black, then emits a newline.
+ * \details The foreground is set first (38;2;0;0;0), then the background
+ *  (48;2;0;0;0), so the line break itself is output with black colors.
+ */
+static inline void tty_graphics_newline() {
+    printf("\033[38;2;0;0;0m" "\033[48;2;0;0;0m" "\n");
+}
+
+typedef void (*tty_graphics_pixelfunc)(int x, int y, uint8_t* r, uint8_t* g, uint8_t* b); // callback type taken by tty_graphics_scan()
+typedef void (*tty_graphics_fpixelfunc)(int x, int y, float* r, float* g, float* b); // callback type taken by tty_graphics_fscan()
+
+/**
+ * \brief Draws an image by calling a user-specified function for each pixel.
+ * \param[in] width , height dimension of the image in square pixels
+ * \param[in] do_pixel the user function to be called for each pixel (a "shader"), that
+ *  determines the (integer) components r,g,b of the pixel's color.
+ * \details Uses half-character pixels: image rows j and j+1 share one text row.
+ */
+static inline void tty_graphics_scan(int width, int height, tty_graphics_pixelfunc do_pixel) {
+    uint8_t r1, g1, b1;
+    uint8_t r2, g2, b2;
+    tty_graphics_home();
+    for (int j = 0; j<height; j+=2) {
+        for (int i = 0; i<width; i++) {
+            do_pixel(i,j  , &r1, &g1, &b1);
+            // Odd height: the last text row has no image row j+1, so draw its
+            // lower half black instead of calling do_pixel() out of range.
+            if(j+1 < height) { do_pixel(i,j+1, &r2, &g2, &b2); } else { r2 = g2 = b2 = 0; }
+            tty_graphics_draw_two_pixels(r1,g1,b1,r2,g2,b2);
+            if(i == width-1) { tty_graphics_newline(); }
+        }
+    }
+}
+
+/**
+ * \brief Converts a floating point value to a byte.
+ * \param[in] f the floating point value, nominally in [0,1]
+ * \return the byte, in [0,255]
+ * \details the input is clamped to [0,1]; NaN maps to 0 (casting NaN is UB)
+ */
+static inline uint8_t tty_graphics_ftoi(float f) {
+    if(!(f > 0.0f)) { return 0; }     // negative, zero or NaN
+    if(f >= 1.0f)   { return 255; }
+    return (uint8_t)(255.0f * f);
+}
+
+/**
+ * \brief Draws an image by calling a user-specified function for each pixel.
+ * \param[in] width , height dimension of the image in square pixels
+ * \param[in] do_pixel the user function to be called for each pixel (a "shader"), that
+ *  determines the (floating-point) components fr,fg,fb of the pixel's color.
+ * \details Uses half-character pixels: image rows j and j+1 share one text row.
+ */
+static inline void tty_graphics_fscan(int width, int height, tty_graphics_fpixelfunc do_pixel) {
+    float fr1, fg1, fb1;
+    float fr2, fg2, fb2;
+    uint8_t r1, g1, b1;
+    uint8_t r2, g2, b2;
+    tty_graphics_home();
+    for (int j = 0; j<height; j+=2) {
+        for (int i = 0; i<width; i++) {
+            do_pixel(i,j  , &fr1, &fg1, &fb1);
+            r1 = tty_graphics_ftoi(fr1);
+            g1 = tty_graphics_ftoi(fg1);
+            b1 = tty_graphics_ftoi(fb1);
+            // Odd height: the last text row has no image row j+1, so draw its
+            // lower half black instead of calling do_pixel() out of range.
+            if(j+1 < height) { do_pixel(i,j+1, &fr2, &fg2, &fb2); } else { fr2 = fg2 = fb2 = 0.0f; }
+            r2 = tty_graphics_ftoi(fr2);
+            g2 = tty_graphics_ftoi(fg2);
+            b2 = tty_graphics_ftoi(fb2);
+            tty_graphics_draw_two_pixels(r1,g1,b1,r2,g2,b2);
+            if(i == width-1) { tty_graphics_newline(); }
+        }
+    }
+}
+
+#endif
--- a/FIRMWARE/tty_graphics_demo.c
+++ b/FIRMWARE/tty_graphics_demo.c
@@ -0,0 +1,38 @@
+#include "tty_graphics.h"
+#include <math.h>
+
+#ifdef __linux__
+#include <stdlib.h>
+#include <unistd.h>
+#endif
+
+// Size of the screen
+// Replace with your own variables or values
+#define graphics_width  80
+#define graphics_height 40
+
+int frame = 0;
+float f = 0.0;
+
+void do_pixel(int i, int j, float* R, float* G, float* B) {
+    // Shader handed to tty_graphics_fscan(): sin() output in [-1,1] is remapped to [0,1].
+    const float col = (float)i, row = (float)j;
+    *R = 0.5f*(1.0+sin(f+col*0.1));            // varies with x, phase f
+    *G = 0.5f*(1.0+sin(2.0*f+row*0.1));        // varies with y, phase 2f
+    *B = 0.5f*(1.0+sin(0.05*(col+row)-3.0*f)); // varies with x+y, phase -3f
+}
+
+int main() {
+    tty_graphics_init();
+    // Endless animation: draw a frame, advance the phase, print the frame count.
+    while(1) {
+        tty_graphics_fscan(graphics_width, graphics_height, do_pixel);
+        f += 0.1;
+        tty_graphics_reset_colors();      // make the counter text readable
+        printf("frame = %d\n",++frame);
+#ifdef __linux__
+        usleep(40000);                    // pause between frames (Linux only)
+#endif
+    }
+    return 0;                             // not reached
+}
--- a/FIRMWARE/wait.S
+++ b/FIRMWARE/wait.S
@@ -0,0 +1,11 @@
+/* wait: busy-wait delay loop (RISC-V). Reads no argument registers and
+ * writes only t0, which it counts down from 1<<17 = 131072 to 0. */
+        .section .text
+        .globl  wait
+wait:
+        li      t0, 1
+        slli    t0, t0, 17              /* t0 = 1 << 17 iterations */
+.Lwait_loop:
+        addi    t0, t0, -1
+        bnez    t0, .Lwait_loop         /* spin until the counter hits 0 */
+        ret
--- a/LESSON1.md
+++ b/LESSON1.md
--- a/README.md
+++ b/README.md
@@ -0,0 +1,8 @@
+## Toolchain
+- Yosys / Yosys NextPNR / Yosys Apicula
+
+## ToDo
+
+- Check the Yosys documentation!
+- TOBB labs
+- Anki cards
--- a/step1.v
+++ b/step1.v
@@ -0,0 +1,24 @@
+/**
+ * Step 1: Blinker
+ * DONE
+ */
+
+`default_nettype none
+
+module SOC (
+    input        clk,    // system clock
+    input        rst_i,  // reset button (unused in this step)
+    output [3:0] led,    // system LEDs
+    input        RXD,    // UART receive (unused in this step)
+    output       TXD     // UART transmit (held low)
+);
+
+   // Free-running 4-bit counter clocked directly by clk; its bits drive the
+   // 4 LEDs. There is no clock divider: led[0] toggles on every clk edge.
+   reg [3:0] blink_cnt = 4'd0;
+   always @(posedge clk) begin
+      blink_cnt <= blink_cnt + 4'd1;
+   end
+   assign led = blink_cnt;
+   assign TXD = 1'b0; // not used for now
+endmodule
--- a/step2.v
+++ b/step2.v
@@ -0,0 +1,40 @@
+/**
+ * Step 2: Blinker (slower version)
+ * DONE*
+ */
+
+`default_nettype none
+`include "clockworks.v"
+
+module SOC (
+    input        clk,    // system clock
+    input        rst_i,  // reset button
+    output [4:0] led,    // system LEDs
+    input        RXD,    // UART receive (unused in this step)
+    output       TXD     // UART transmit (held low)
+);
+
+   wire clkI;    // internal clock, connected to Clockworks' clk port
+   wire resetn;  // internal reset signal, goes low on reset
+
+   // Clock gearbox (slows the clock down so the LEDs can be watched) and
+   // reset circuitry (works around an initialization problem with Ice40).
+   Clockworks #(
+     .SLOW(21) // Divide clock frequency by 2^21
+   ) CW (
+     .CLK(clk),
+     .RESET(rst_i),
+     .clk(clkI),
+     .resetn(resetn)
+   );
+
+   // A blinker that counts on 5 bits, wired to the 5 LEDs. It runs on the
+   // internal clock and is cleared synchronously while resetn is low.
+   reg [4:0] count = 5'd0;
+   always @(posedge clkI) begin
+      count <= resetn ? count + 5'd1 : 5'd0;
+   end
+
+   assign led = count;
+   assign TXD = 1'b0; // not used for now
+endmodule
--- a/step3.v
+++ b/step3.v
@@ -0,0 +1,65 @@
+/**
+ * Step 3: Display a led pattern "animation" stored in BRAM.
+ * DONE*
+ */
+
+`default_nettype none
+`include "clockworks.v"
+
+module SOC (
+    input        clk,    // system clock
+    input        rst_i,  // reset button
+    output [4:0] led,    // system LEDs
+    input        RXD,    // UART receive (unused in this step)
+    output       TXD     // UART transmit (held low)
+);
+
+   // Step 3: plays a 21-step LED pattern stored in a small memory.
+   wire clkI;    // internal clock, connected to Clockworks' clk port
+   wire resetn;  // internal reset signal, goes low on reset
+
+   // Gearbox and reset circuitry, fed by the board clock and the reset
+   // button.
+   Clockworks #(
+     .SLOW(25) // Divide clock frequency by 2^25
+   ) CW (
+     .CLK(clk),
+     .RESET(rst_i),
+     .clk(clkI),
+     .resetn(resetn)
+   );
+
+   // LED "animation": MEM holds one 5-bit LED pattern per step, PC is the
+   // index of the step that is displayed next.
+   localparam LAST_STEP = 20;
+   reg [4:0] PC = 5'd0;
+   reg [4:0] MEM [0:LAST_STEP];
+   initial begin
+      // Steps 0-5: all off, then one lit LED walks from bit 0 up to bit 4.
+      MEM[0]  = 5'b00000;  MEM[1]  = 5'b00001;  MEM[2]  = 5'b00010;
+      MEM[3]  = 5'b00100;  MEM[4]  = 5'b01000;  MEM[5]  = 5'b10000;
+      // Steps 6-15: more LEDs walk up and stack below bit 4 until all are lit.
+      MEM[6]  = 5'b10001;  MEM[7]  = 5'b10010;  MEM[8]  = 5'b10100;
+      MEM[9]  = 5'b11000;  MEM[10] = 5'b11001;  MEM[11] = 5'b11010;
+      MEM[12] = 5'b11100;  MEM[13] = 5'b11101;  MEM[14] = 5'b11110;
+      MEM[15] = 5'b11111;
+      // Steps 16-20: LEDs switch off one at a time, starting from bit 0.
+      MEM[16] = 5'b11110;  MEM[17] = 5'b11100;  MEM[18] = 5'b11000;
+      MEM[19] = 5'b10000;  MEM[20] = 5'b00000;
+   end
+
+   // Registered LED outputs.
+   reg [4:0] leds = 5'd0;
+   assign led = leds;
+
+   // On each clkI edge: latch the pattern at PC into leds, then advance PC,
+   // restarting from step 0 after LAST_STEP or while resetn is low.
+   wire pc_restart = !resetn || (PC == LAST_STEP);
+   always @(posedge clkI) begin
+      leds <= MEM[PC];
+      PC   <= pc_restart ? 5'd0 : PC + 5'd1;
+   end
+
+   // The UART is not used in this step; TXD is tied to a constant level.
+   assign TXD = 1'b0;
+endmodule
--- a/step3K.v
+++ b/step3K.v
@@ -0,0 +1,59 @@
+`include "clockworks.v"
+
+module SOC (
+  input        clk,    // system clock
+  input        rst_i,  // reset button
+  output [4:0] led,    // LEDs showing the current pattern
+  output       TXD,    // UART transmit (held low)
+  input        RXD     // UART receive (unused)
+);
+
+  // Plays a 21-step LED pattern stored in a small memory.
+  wire clkI;    // internal clock, connected to Clockworks' clk port
+  wire resetn;  // internal reset, connected to Clockworks' resetn port
+
+  // Clock/reset helper fed by the board clock and the reset button.
+  // SLOW(21): presumably divides the clock by 2^21 -- confirm in clockworks.v.
+  Clockworks #(
+    .SLOW(21)
+  ) clkw (
+    .CLK(clk),
+    .RESET(rst_i),
+    .clk(clkI),
+    .resetn(resetn)
+  );
+
+  // LED "animation": MEM holds one 5-bit LED pattern per step, PC is the
+  // index of the step that is displayed next.
+  localparam LAST_STEP = 20;
+  reg [4:0] PC = 5'd0;
+  reg [4:0] MEM [0:LAST_STEP];
+  initial begin
+    // Steps 0-5: all off, then one lit LED walks from bit 0 up to bit 4.
+    MEM[0]  = 5'b00000;  MEM[1]  = 5'b00001;  MEM[2]  = 5'b00010;
+    MEM[3]  = 5'b00100;  MEM[4]  = 5'b01000;  MEM[5]  = 5'b10000;
+    // Steps 6-15: more LEDs walk up and stack below bit 4 until all are lit.
+    MEM[6]  = 5'b10001;  MEM[7]  = 5'b10010;  MEM[8]  = 5'b10100;
+    MEM[9]  = 5'b11000;  MEM[10] = 5'b11001;  MEM[11] = 5'b11010;
+    MEM[12] = 5'b11100;  MEM[13] = 5'b11101;  MEM[14] = 5'b11110;
+    MEM[15] = 5'b11111;
+    // Steps 16-20: LEDs switch off one at a time, starting from bit 0.
+    MEM[16] = 5'b11110;  MEM[17] = 5'b11100;  MEM[18] = 5'b11000;
+    MEM[19] = 5'b10000;  MEM[20] = 5'b00000;
+  end
+
+  // Registered LED outputs.
+  reg [4:0] leds = 5'd0;
+  assign led = leds;
+
+  // On each clkI edge: latch the pattern at PC into leds, then advance PC,
+  // restarting from step 0 after LAST_STEP or while resetn is low.
+  wire pc_restart = !resetn || (PC == LAST_STEP);
+  always @(posedge clkI) begin
+    leds <= MEM[PC];
+    PC   <= pc_restart ? 5'd0 : PC + 5'd1;
+  end
+
+  // The UART is not used here; TXD is tied to a constant level.
+  assign TXD = 1'b0;
+endmodule