Verilog Basic Notes
General Flow
Create Project
.v(sources).xdc(constraints)
RTL Analysis
Schematic
Simulate Design
_tb.v: test bench - simulation options: 500ns
 - Scopes -> dut -> swt/led
 - Schematic
 
Synthesize Design
Schematic
Implement Design
Schematic
Post-Implementation Timing Simulate Design
Generate Bit Stream
Testing and Verification
low-level (circuit) simulation is much slower than high-level (C, HDL) simulation:
- check only functionality at high level (C, HDL)
 - check only timing, power at low level (circuit)
 
// Self-checking test bench for sillyFunction.
// Each 4-bit test vector is {a, b, c, yExpected}. Inputs are applied shortly
// after every rising clock edge and the DUT output is compared against the
// expected value on the following falling edge.
module testBench();
  reg         clk, reset;           // clock and reset are internal
  reg         a, b, c, yExpected;   // values from testVectors
  wire        y;                    // output of circuit
  reg  [31:0] vectorNum, errors;    // bookkeeping variables
  reg  [3:0]  testVectors[10000:0]; // array of testVectors
  // instantiate device under test
  sillyFunction dut(.a(a), .b(b), .c(c), .y(y) );
  // generate clock
  always     // no sensitivity list, so it always executes
    begin
      clk = 1; #5; clk = 0; #5;     // period of 10 time units (10ns with a 1ns time unit)
    end
  // at start of test, load vectors and pulse reset
  initial   // Only executes once
  begin
    $readmemb("example.tv", testVectors); // Read vectors: e.g 000_0 001_1 ... xxx_x
    vectorNum = 0; errors = 0;            // Initialize
    reset = 1; #27; reset = 0;            // Hold reset for 27 time units, then release
  end
  // Note: $readmemh reads testVector files written in
  // hexadecimal
  // apply test vectors on rising edge of clk
  always @(posedge clk)
  begin
    // wait 1 time unit after the edge before driving the inputs
    #1; {a, b, c, yExpected} = testVectors[vectorNum];
  end
  // check results on falling edge of clk
  always @(negedge clk)
  begin
    if (~reset) // don't test during reset
    begin
      // !== also reports a mismatch when y is x or z
      if (y !== yExpected)
      begin
        $display("Error: inputs = %b", {a, b, c});
        $display("  outputs = %b (%b exp)",y,yExpected);
        errors = errors + 1;
      end
      // increment array index and read next testVector
      vectorNum = vectorNum + 1;
      // an all-x entry (unloaded slot or explicit xxx_x line) marks the end of the vectors
      if (testVectors[vectorNum] === 4'bx)
      begin
        $display("%d tests completed with %d errors", vectorNum, errors);
        $finish;                 // End simulation
      end
    end
  end
endmodule
Timing in Circuits
Combination Circuit Timing
- contamination delay (t_cd): minimum path in circuits, outputs start to change
- propagation delay (t_pd): maximum path in circuits, outputs complete change
- (delay) heavy dependence on voltage and temperature
 
Sequential Circuit Timing
minimize clock skew time: requires an intelligent clock network across the chip, so that the clock arrives at all locations at roughly the same time.
T_clock >= T_pcq + T_pd + (T_setup + T_skew)
T_ccq + T_cd > (T_hold + T_skew)
Key Words
module
input, output, inout
wire, reg, parameter
always@(), assign
posedge, negedge
if-else, case, begin ... end
port, signal
Module
- Inout(Net) -> (Net)Module(Net) -> Inout(Net)
 - Input(Net/Register) -> (Net)Module(Net/Register) -> Output(Net)
 
// Skeleton showing the usual layout of a (non-ANSI style) Verilog module:
// port list, port directions, data types, sub-module instances, functionality.
module moduleName( In1, In2, Out1, Out2, InOut1);
    // Port directions: state whether each port is an input, output or inout
    input In1, In2;
    output Out1, Out2;
    inout InOut1;
    // Data types: state whether each port is a net or a register
    wire In1, In2, Out1;
    wire InOut1;
    reg Out2;
    // Instantiation of lower level modules, using named connections: .port(signal)
    Design u_2(.port1(signal1), .port2(signal2), .port3(signal3));
    // Functionality
    // Three levels of description: Gate Level, Dataflow Level, Behavior Level
    // NOTE: the gate and the assign below both drive Out1 purely to show the
    // two styles side by side; a real design keeps only one driver per net.
    // and/or - gate level
    and and1( Out1, In1, In2);
    // assign - dataflow level
    assign #2 Out1 = In1 & In2;
    // always/initial - behavior level
    always @(*)
        begin
            Out2 = In1 & In2;
        end
    // Timing specification
endmodule
外部端口
- 封装内部端口,装饰者模式
 - 一个内部端口可与多个外部端口相连
 
// Example of explicit external port names: each .external(internal) entry
// exposes the internal signal under a different name to the outside.
module scram_b (
    .data(arb),
    .control(ctrl),
    .mem_word(mem_blk),
    .addr(byte)
);
    // internal signal behind the external port "addr"
    output [0:3] byte;
    // internal signals behind "mem_word", "control" and "data"
    input  [8:0] mem_blk;
    input        ctrl;
    input  [0:3] arb;
endmodule
function
- 不含时间/事件控制
 - 至少 1 个输入
 - 至多 1 个输出
 - 只含行为模块
 - 只含阻塞赋值语句
 
// Left-justify an 8-bit word: shift it left until bit 7 is set.
// A word that compares equal to zero is returned unchanged.
function [7:0] aligned_word;
    input [7:0] word;
    begin
      aligned_word = word;
      // a set bit is never shifted out, because shifting stops once it reaches bit 7
      while ((aligned_word != 0) && (aligned_word[7] == 0))
        aligned_word = {aligned_word[6:0], 1'b0};
    end
  endfunction
// Combinational unit built from two functions:
//   result_1 = operand_1 + operand_2 (5 bits, so the carry is kept)
//   result_2 = the larger of the two operands
// The stray trailing comma in the original port list (which declared an
// extra, empty port) has been removed.
module arithmetic_unit (result_1, result_2, operand_1, operand_2);
  output   [4: 0] result_1;
  output  [3: 0] result_2;
  input   [3: 0] operand_1, operand_2;
  assign result_1 = sum_of_operands (operand_1, operand_2);
  assign result_2 = larger_operand (operand_1, operand_2);
  // 5-bit sum of two 4-bit operands
  function [4: 0] sum_of_operands(input [3:0] operand_1, operand_2);
    sum_of_operands = operand_1 + operand_2;
  endfunction
  // returns operand_1 when the operands are equal
  function [3: 0] larger_operand(input [3:0] operand_1, operand_2);
    larger_operand = (operand_1 >= operand_2) ? operand_1 : operand_2;
  endfunction
endmodule
task
将测试流程分为多个任务:
- 初始化任务
 - 模拟生成任务
 - 自我检查任务
 
// Registered 4-bit adder with carry in/out; the addition itself lives in a task.
//   reset (asynchronous, active high) clears {c_out, sum}
//   otherwise every rising clk edge stores data_a + data_b + c_in
module adder_task (c_out, sum, clk, reset, c_in, data_a, data_b);
  output reg [3: 0]  sum;
  output reg  c_out;
  input  [3: 0]  data_a, data_b;
  input   clk, reset, c_in;

  // Task outputs are copied back with blocking semantics, so the task writes
  // into these temporaries and the registers themselves are only ever updated
  // with nonblocking assignments.
  reg [3: 0]  next_sum;
  reg         next_c_out;

  always @(posedge clk or posedge reset) begin
    if (reset) {c_out, sum} <= 0;
    else begin
      add_values (next_sum, next_c_out, data_a, data_b, c_in); // invoke task
      sum   <= next_sum;
      c_out <= next_c_out;
    end
  end

  // {C_OUT, SUM} = DATA_A + DATA_B + C_IN, evaluated at 5 bits so the carry is kept
  task add_values; // task declaration
    output reg [3: 0]  SUM;
    output reg  C_OUT;
    input  [3: 0]  DATA_A, DATA_B;
    input   C_IN;
    begin
      {C_OUT, SUM} = DATA_A + (DATA_B + C_IN);
    end
  endtask
endmodule
常用的系统 task 有: $monitor("fmt", ...), $finish
Data Structure
常量
- 0: 逻辑 0
 - 1: 逻辑 1
 - x/X: Unknown/Floating
 - z/Z: 高阻抗状态(High Impedance)
 - parameter: #define
 
localparam idle = 2'b00;
parameter Bit = 8, cnt_up = 1'b1;
output [Bit - 1:0] A;
reg [Bit - 1:0] A;
A = A + cnt_up;
向量
[MSB: LSB] 或 [LSB: MSB]
output [3:0] A;    // 4 bit
reg [0:3] B, C;    // 4 bit
wire [63:0] D;    // 64 bit
wire E;               // 1 bit
// B[0] is zero-extended to the 2-bit width of C[1:2] before the AND
A[2:1] = B[0] & C[1:2];    // A[2] = 0 & C[1], A[1] = B[0] & C[2]
B[0:1] = 0;
// C is declared [0:3], so index 0 is the leftmost (most significant) bit
C = 4'b1011;                   // C[0] = 1, C[1] = 0, C[2] = 1, C[3] = 1
// "integer" cannot take a bit range, so the 8-bit-element array is declared as reg
reg [7:0] A [3:0];            // array of 4 elements, each element is 8 bit
reg B [3:0] [15:0];            // arr[4][16] array, each element is 1 bit
reg [7:0] C [3:0] [15:0];    // arr[4][16] array, each element is 8 bit
integer idx;
A[3] = 0;                     // clear element A[3] (8 bit)
// Verilog cannot assign to a whole array at once; clear it element by element
for (idx = 0; idx < 4; idx = idx + 1)
    A[idx] = 0;
B[1][0] = 1;                  // set the 1-bit element B[1][0]
C[0][0][3:0] = 4'b0010;      // low 4 bits of C[0][0] become 0010
C[2][8][5] = 1;               // set bit 5 of C[2][8]
部分位选
vector[base_expr+: const_width];
vector[base_expr-: const_width];
inst_mode[mark+:2]; // => mark,mark+1
gpio_mode[mark-:4]; // => mark,mark-1,mark-2,mark-3
数字
// size ' signed base value
<Bits长度>'[signed]<进制><数值>
- 位长不能用表达式表示,只可用固定的 parameter
 
Num = 5'b01101;               // 二进制
Num = 22;                     // 十进制
Num = 12'b0000_1111_0000;    // 可读性
Num = 4'hf;                  // 十六进制(1111)
Num = 4'bxxx1;              // 前三位未知
Num = 4'bz01;               // 前两位为z, 后两位为01
有符号数
- signed reg
 - signed wire
 - integer
 - 'sxx
 
无符号数
- reg
 - wire
 - 'xx
 
Register
- reg/integer/time/real/realtime
 - 有记忆性
 - 默认值: x
 
integer
长度为 32 Bit, 补码表示, 常用于计数器
// Copy In into tmp[0..7] on every rising edge of CLK.
// The block is named because Verilog only allows local declarations
// (here: the loop counter) inside a named begin-end block.
always @(posedge CLK)
    begin : fill_tmp
        integer i;
        // nonblocking: tmp is written in a clocked block
        for (i = 0; i <= 7; i = i + 1) tmp[i] <= In;
    end
real
- real 默认值为 0,不可为 x/z
 - 不可声明位宽
 
Net
- wire/wand/wor
 - 局部变量, 没有记忆性
 - 默认值: z
 - wire 间不可直接相连, wand/wor 间可直接相连
 - 可用 wire 定义局部变量
 
Gate Level
Basic Gate
- and
 - nand(与非)
 - or
 - nor(或非)
 - xor(异或)
 - xnor(同或)
 
Use Gate
- 同一模块中, 实例名不能与线网名相同
 
// Gate instances: the first terminal is the output, the rest are inputs.
// The instance name is optional, but it must not be a keyword such as "xor".
and (w1, In1, In2);        // w1 = In1 and In2
or or1(w2, w1, In2);      // w2 = w1 or In2
xor xor1(Out, w1, w2);    // Out = w1 xor w2
- 实例数组
 
wire [3:0] irq, ctrl, sense;
/*
 * =>
 * nand
 *      u8nand3 (irq[3], ctrl[3], sense[3]);
 *      u8nand2 (irq[2], ctrl[2], sense[2]);
 *      u8nand1 (irq[1], ctrl[1], sense[1]);
 *      u8nand0 (irq[0], ctrl[0], sense[0]);
 */
nand u8nand [3:0] (irq, ctrl, sense);
parameter NUM_BITS = 4;
wire [NUM_BITS - 1 : 0] gated_d, din;
wire bypass;
and #(1, 2) u0and [NUM_BITS - 1: 0] (gated_d, din, bypass);
Self-Defined Gate(用户自定义原语)
- 可以有一个/多个输入
 - 只能有一个输出
 - 第一个端口必须是输出端口
 - table 中的 "-" 表示输出值"无变化"
// User-defined primitive: 2-input exclusive OR.
// The output must be the first terminal in the port list.
primitive XOR2 (D_OUT, X1, X2);
    output D_OUT;
    input X1, X2;
    table // X1 X2 : D_OUT
        // inputs equal -> 0
        0 0 : 0;
        1 1 : 0;
        // inputs differ -> 1
        0 1 : 1;
        1 0 : 1;
    endtable
endprimitive
Dataflow Level
- assign net = net/reg: 左式只能是 net
 
Operators
赋值: <=, =
>, <, <=, >=
!=, ==
[ ], { }
<<, >>
+, -, *, /, %
整数提升
- 表达式所有中间取 最大位宽(最长(左/右)操作数)
 
可实现 haskell 中的模式匹配
// Concatenation and replication operators
A = { 1'b0, 1'b1};                // A = 2'b01
A = { B[1:0], C[0], D[2] };    // A = {B[1], B[0], C[0], D[2]}
A = { 2{2'b01} };                  // A = 4'b0101
// a replication used inside a concatenation needs its own pair of braces
A = { 3'b101, {2{1'b0}} };      // A = 5'b101_00
Behavior Level
reg = net/reg: 左式只能是reg.
时延控制
#num
parameter cycle = 30;
# 2
# cycle/2
事件控制
@(*);
@( sel, a, b);
@(sel or a or b);
@(posedge CLK);
@(negedge CLK);
语句内/间控制
q = @(posedge clk_iol) d; // 语句内事件控制
@(posedge clk_iol)        // 语句间事件控制
    q = temp;
always
always @(事件1, 事件2, ...)
    begin
        ...;
    end
if-else
- 必须添加 else
 
if (condition1)
    begin
        ...;
    end
else if (condition2)
    begin
        ...;
    end
else
    begin
        ...;
    end
Case Statement
- expr: 常量/变量/连接运算符/x/z
 - casex: 当输入某一位为 x/z 时,忽略此位匹配(恒将此位匹配为真)
 - casez: 当输入某一位为 z 时,忽略此位匹配(恒将此位匹配为真)
 
case (expr)
    item 1:
        begin
            ...;
        end
    item 2:
        begin
            ...;
        end
    item 3:
        begin
            ...;
        end
    default:
        ...;
endcase
for
for (循环初值; 循环条件; 控制部分)
    begin
        ...;
    end
repeat loop
- initial for test bench
 - 当需 if/else 进行断言时,注意 延时 造成的错误逻辑
 
// 重复事件控制:
// 先计算好右值, 等待时钟 clk 上出现2个负跳变沿, 再把右值赋给 result
result = repeat (2) @(negedge clk) hw_data + hr_data;
// repeat 循环语句:
repeat (2)
    @(posedge clk) result = hw_data + hr_data;
// Stimulus that waits for a fixed number of clock edges using repeat + event control.
// (The sized literals now use the ASCII apostrophe; the typographic quote
// in the original is not valid Verilog.)
initial begin
    inc_DAC = 1'b1;
    repeat(4095) @(posedge clk); // bring DAC right up to point of rollover
    inc_DAC = 1'b0;
    inc_sym = 1'b1;
    repeat(7)@(posedge clk); // bring sample count up to 7
    inc_sym = 1'b0;
end
initial begin
    #100 $finish; // run simulation for 100 units
end
forever loop
// $stop, $finish 可以终止 forever loop
forever #10 clk = ~ clk;
Force and Release
initial
    begin
        force test_reset = penalty & rtc_intr;
        #5;
        release test_reset;
    end
Blocking and Non-Blocking
- Blocking(=): 顺序执行
- Non-Blocking(<=): 并行执行
output = input_logic;
output <= input_logic;
disable
begin : break
    for (i = 0; i < n; i = i+1) begin : continue
        @(posedge clk)
        if (a == 0) // "continue" loop
            disable continue;
        if (a == b) // "break" from loop
            disable break;
        statement1
        statement2
    end
end
结构建模
generate 语句
generate
    for (gv_i = 0; gv_i < SIZE; gv_i = gv_i + 1)
        begin: blk
            xor uxor (y[gv_i], a[gv_i], b[gv_i]);
        end
endgenerate
// =>
// module.blk[0].uxor
// module.blk[1].uxor
// module.blk[2].uxor
// ...
Delay(时延)
- 语句内时延
 - 语句间时延
 - 语句内时延期间:右值保持稳定不变,才可成功赋给左值
 
sum = (a ^ b) ^ cin;
#4 t1 = a & cin;
预编译指令
define 宏
将多个 define 宏,放至 _defines.v, 作为全局宏.
Data Path
Multiplexer
Adder
Register Data Path
Memory
- 其中数据文件中地址必须在系统任务中定义的范围内,系统任务中定义的地址必须在存储器定义的地址范围内
 - 优先考虑数据文件中的地址>系统任务中定义的起始地址和结束地址>存储器定义的起始地址和结束地址
 
Demos
Binary Multiplier
   1100 (the multiplicand)
x  1011 (the multiplier)
   ----
   0000 (initial partial product, start with 0000)
   1100 (1st multiplier bit is 1, so add the multiplicand)
   ----
   1100 (sum)
   ----
   01100 (shift sum one position to the right)
   1100 (2nd multiplier bit is 1, so add multiplicand again)
   ----
  100100 (sum, with a carry generated on the left)
   ----
   100100 (shift sum once to the right, including carry)
   0100100 (3rd multiplier bit is 0, so skip add, shift once)
   ----
   1100 (4th multiplier bit is 1, so add multiplicand again)
   ----
  10000100 (sum, with a carry generated on the left)
   10000100 (shift sum once to the right, including carry)
Multi-Hz
/*
 * Arbitrary frequency division with a counter.
 * p is the frequency control word: i counts 0..p, and f_out toggles each time
 * i reaches p, i.e. once every (p + 1) rising edges of f_clk.
 */
always @(posedge f_clk) begin
    // nonblocking assignments: i and f_out are registers updated on the clock edge
    if (i == p) begin
        i <= 0;
        f_out <= ~f_out;
    end
    else begin
        i <= i + 1;
    end
end
Tips
不可综合结构
- initial: 只用于 test bench
 - events: Events 同步测试各个组件
 - real: Real 数据类型不可综合
 - time: Time 数据类型不可综合
 - force/release
 - assign(reg)/deassign(reg)
 - fork join
 - primitive: 只有门级的原语(primitives)可综合
 - table: 用户自定义原语(UDP)及 table 不可综合
 #1延迟只用于仿真,综合器直接忽略延迟
混合编程
- 内部变量用 assign 赋值
 - 输出变量通过监听 内部变量 改变输出值
 
    assign DT0 = ...;
    assign DT1 = ...;
    always @(DT0) begin
        AOut <= DT0;
    end
    always @(DT1) begin
        BOut <= DT1;
    end
上升沿/下降沿
    always @(posedge A or negedge B) begin
        if (A) ...
        else if (!B) ...
        else ...
    end
Parameter
- 只在定义的模块内部起作用
 
Overload Method
module data_path
#(parameter DATA_WIDTH = 8)
(
    input A,
    input [(DATA_WIDTH - 1): 0] B,
    output [(DATA_WIDTH - 1): 0] C
);
    ......
endmodule
module data_path_tb
(
);
    data_path #(.DATA_WIDTH(16)) DUT (.A(A), .B(B), .C(C));
    ......
endmodule
Constant Variable
reset_value = {{(DATA_WIDTH/2){1'b0}}, {(DATA_WIDTH/2){1'b1}}};
Test Bench
always begin
    clk = 0;
    forever #DELAY clk = ~clk;
end
// Free-running clock plus a counter of its rising edges; after TIME time units
// the bench prints the observed and the expected edge count and stops.
reg clock;
integer no_of_clocks;
parameter CLOCK_PERIOD = 5;
parameter TIME = 50000;
initial no_of_clocks = 0;
initial clock = 1'b0;
// toggle every half period
always #(CLOCK_PERIOD/2.0) clock = ~clock;
always @(posedge clock)
    no_of_clocks = no_of_clocks +1 ;
initial begin
    #TIME;
    // A string literal must stay on one line, and the expected count is derived
    // from CLOCK_PERIOD instead of repeating the literal 5.
    $display("End of simulation time is %d , total number of clocks seen is %d expected is %d",
             $time, no_of_clocks, ($time/CLOCK_PERIOD));
    $finish;
end
有限状态机(FSM)
- reset: initial state
 - default: illegal/unreachable state
 
算法状态机(ASM, Algorithmic State Machine)
- state box: moore fsm
 - conditional box: mealy fsm
 - decision box: x_input = 0/1
System Verilog
Enum
typedef enum logic [2:0] {
  RED, GREEN, BLUE, CYAN, MAGENTA, YELLOW
} color_t;
color_t my_color = GREEN;
initial $display("The color is %s", my_color.name());
Struct and Union
typedef struct packed {
  bit [10:0]  expo;
  bit         sign;
  bit [51:0]  man;
} FP;
FP zero = 64'b0;
Procedural Block
- always_comb: 用于组合逻辑电路(相当于 Verilog 中对所有输入变量电平敏感的 always,但 always_comb 无需手动列出所有输入变量,系统会自动识别)
 - always_ff: 用于触发器及相关的时序逻辑电路(相当于 Verilog 中对某个或某几个信号有效跳变沿敏感、并带有信号储存特性的 always)
 - always_latch: 用于锁存器及相关的时序逻辑电路(相当于 Verilog 中对某个或某几个信号电平敏感、并带有信号储存特性的 always)
 
always_comb begin
  tmp = b * b - 4 * a * c;
  no_root = (tmp < 0);
end
always_ff @(posedge clk)
  count <= count + 1;
always_latch
  if (en) q <= d;
Interface
// Bundle of two signals with two modport views of opposite direction.
interface interfaceName;
  logic a;
  logic b;
  // view "in": a is read, b is driven
  modport in (input a, output b);
  // view "out": b is read, a is driven
  modport out (input b, output a);
endinterface
// Connects both modules through one shared interface instance.
module top;
  interfaceName i ();
  u_a m1 (.i1(i));
  u_b m2 (.i2(i));
endmodule
// Sees the interface through the "in" modport.
module u_a (interfaceName.in i1);
endmodule
// Sees the interface through the "out" modport.
module u_b (interfaceName.out i2);
endmodule
Testing
// Procedural random test: pulse reset_n low, then drive random A/B/C values
// into register_logic once per clock and print inputs and output.
module top;
  // NOTE(review): $random returns a signed 32-bit value, so the packet count
  // may be negative or very large — confirm this is intended.
  integer num_packets = $random;
  reg A, B, C, clk, reset_n;
  wire D;
  register_logic dut(A, B, C, clk, reset_n, D);
  // generate clock
  // ...
  initial begin
    run();
  end
  task run();
    reset_n  = 1;
    // drive reset_n low after 20 time units
    #20 reset_n = 0;
    // drive reset_n high again 1 time unit after the next rising clock edge
    @(posedge clk) reset_n <= #1 1;
    repeat (num_packets) begin
      // only the least significant bit of each $random result fits the 1-bit regs
      A = $random; B = $random; C = $random;
      @(posedge clk);
      $display(A, B, C, D);
    end
    $finish;
  endtask
endmodule
// Randomizable stimulus item: sa/da fields plus the three 1-bit values A, B, C.
class Packet;
  string name;
  rand bit[3:0] sa, da;
  rand reg A, B, C;
  // Print A, B, C followed by the given result value.
  function void display(result);
    $display(A, B, C, result);
  endfunction
endclass: Packet
// inheritance
// Packet variant whose da field is constrained to 3.
class Packet_da_3 extends Packet;
  constraint da_3 {
    da == 3;
  }
  // Print the base-class line first, then sa and da.
  function void display(result);
    super.display(result);
    $display(sa, da);
  endfunction
endclass: Packet_da_3
// Produces num_packets random packets and puts each one into out_chan.
class Generator;
  Packet pkt;
  Channel out_chan;
  int num_packets;
  // Create, randomize and send a single packet.
  function void gen();
    pkt = new();
    // randomize() returns 0 when the constraints cannot be solved
    if (!pkt.randomize())
      $error("Packet randomization failed");
    out_chan.put(pkt);
  endfunction
  // Generate packets until the counter has run down to zero.
  task run();
    while (num_packets-- != 0)
      gen();
  endtask
endclass
// Takes packets from in_chan and drives their A/B/C values onto the
// testbench signals in module top, one packet per clock.
class Driver;
  Channel in_chan;
  Packet pkt;   // packet currently being driven (was used without a declaration)
  // Fetch one packet, apply it, then wait for the next rising clock edge.
  task send();
    in_chan.get(pkt);
    top.A = pkt.A;
    top.B = pkt.B;
    top.C = pkt.C;
    @(posedge top.clk);
  endtask
  // Drive packets indefinitely; this task never returns.
  task run();
    forever send();
  endtask
endclass
// Class-based test environment: build() creates and connects the components,
// run() starts generator and driver in parallel.
module top;
  // Handles live at module scope so that run() can reach the objects that
  // build() creates (as locals of build() they were invisible to run()).
  Config    cfg;
  Channel   chan;
  Generator gen;
  Driver    drv;
  initial begin
    build();
    run();
  end
  task build();
    cfg  = new();
    chan = new();
    gen  = new();
    drv  = new();
    // generator and driver share one channel
    gen.out_chan = chan;
    drv.in_chan = chan;
    if (!cfg.randomize() with { num_packets > 1500; })
      $error("Config randomization failed");
    gen.num_packets = cfg.num_packets;
  endtask
  task run();
    // drv.run() loops forever, so a plain join would never reach $finish;
    // join_any continues as soon as the generator has finished.
    // NOTE(review): packets still queued in the channel at that moment are
    // not driven — confirm whether a drain delay is needed before $finish.
    fork
      gen.run();
      drv.run();
    join_any
    $finish;
  endtask
endmodule
U280 Platform
-xp param (clock frequency etc.)
-R report level
-slr SLR region setting
-sp memory resources mapping
tools:
- xbutil query
 - platforminfo
 - kernelinfo
 - xclbinutil
 - dmesg
 
Host Application
Basic Flow
- set the kernel arguments before performing any enqueue operation
 - keeping the buffer size 2 MB ~ 4 GB
 - posix_memalign is used instead of malloc for the host memory space pointer
uint32_t *a, *b, *c, *d = NULL;
posix_memalign((void **)&a, 4096, BUF_SIZE * sizeof(uint32_t));
posix_memalign((void **)&b, 4096, BUF_SIZE * sizeof(uint32_t));
posix_memalign((void **)&c, 4096, BUF_SIZE * sizeof(uint32_t));
posix_memalign((void **)&d, 4096, BUF_SIZE * sizeof(uint32_t));
- release resources for proper performance profile report
 
clReleaseCommandQueue(Command_Queue);
clReleaseContext(Context);
clReleaseDevice(Target_Device_ID);
clReleaseKernel(Kernel);
clReleaseProgram(Program);
free(Platform_IDs);
free(Device_IDs);
TLP
It is advisable to use the posix_spawn() system call
to launch another process from the SDAccel environment application.
U280 Tools
GDB Based Debugging
xprint queue [<cl_command_queue>]
xprint event <cl_event>
xprint mem [<cl_mem>]
xprint kernel
xprint all
xstatus all
xstatus --<ipName>
XCL Binary Util
xclbinutil -i binary_container_1.xclbin --info
XOCC
Checking out-of-bound access made by kernel interface buffers (option: address) and uninitialized memory access initiated by kernel local to kernel (option: memory).
xocc -l -t sw_emu --xp param:compiler.fsanitize=address -o bin_kernel.xclbin
xocc -l -t sw_emu --xp param:compiler.fsanitize=memory -o bin_kernel.xclbin
xocc -l -t sw_emu --xp param:compiler.fsanitize=address,memory -o bin_kernel.xclbin
XBUtil
sudo /opt/xilinx/xrt/bin/xbutil flash -a <shell_name> # flash the firmware
sudo lspci -vd 10ee:
sudo /opt/xilinx/xrt/bin/xbutil flash scan
sudo /opt/xilinx/xrt/bin/xbutil validate -d <card_id>
xbutil program -p <xclbin>
xbutil query         # check memory banks usage
xbutil status --lapc # check AXI violations
dmesg
ILA Trigger
- debug protocol hangs
 - examine the burst size, pipelining and data width to locate the bottleneck
 
....
std::string binaryFile = xcl::find_binary_file(device_name,"vAdd");
cl::Program::Binaries bins = xcl::import_binary_file(binaryFile);
devices.resize(1);
cl::Program program(context, devices, bins);
cl::Kernel kernel_vAdd(program,"kernel_vAdd_rtl");
// wait_for_enter("\nPress ENTER to continue after setting up ILA trigger...");
// Block until the user presses ENTER, leaving time to arm the ILA trigger.
std::cout << "Pausing to arm ILA trigger. Hit enter here to resume host program..."
          << std::endl;
// std::cin is an object, so get() is called with '.', not '::'
std::cin.get();
//Allocate Buffer in Global Memory
std::vector<cl::Memory> inBufVec, outBufVec;
cl::Buffer buffer_r1(context,CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
vector_size_bytes, source_input1.data());
// ...
// ...
// ...
//Copy input data to device global memory
q.enqueueMigrateMemObjects(inBufVec,0/* 0 means from host*/);
//Set the Kernel Arguments
// ...
// ...
// ...
//Launch the Kernel
q.enqueueTask(kernel_vAdd);
AXI Protocol
Advanced eXtensible Interface Protocol:
- handshake protocol: ready-valid protocol
 
AXI Channels
Read Address Channel
- arburst: burst type
 - araddr: start address
 - arlen: (# of transfers) - 1
 - arsize: bytes/transfer
 - arready (memory to host)
 - arvalid
 
Read Data Channel
- rdata: data
 - rresp: response (failure check)
 - rlast: flag for last piece of data
 - rready (host to memory)
 - rvalid
 
Write Address Channel
- awburst: burst type
 - awaddr: start address
 - awlen: (# of transfers) - 1
 - awsize: bytes/transfer
 - awready (memory to host)
 - awvalid
 
Write Data Channel
- wdata: data
 - wstrb: write strobe -> write mask (1 bit mask for 1 byte data)
 - wlast: flag for last piece of data
 - wready (memory to host)
 - wvalid
 
Write Response Channel
- bresp: response (failure check)
 - bready (host to memory)
 - bvalid
 
AXI Burst
| AxBURST[1:0] | Burst Type | 
|---|---|
| 0b00 | FIXED | 
| 0b01 | INCR | 
| 0b10 | WRAP | 
| 0b11 | Reserved | 
burst length = AxLEN[7:0] + 1 (up to 256 transfers in each burst)
| AxSIZE[2:0] | Bytes in Transfer | 
|---|---|
| 0b000 | 1 | 
| 0b001 | 2 | 
| 0b010 | 4 | 
| 0b011 | 8 | 
| 0b100 | 16 | 
| 0b101 | 32 | 
| 0b110 | 64 | 
| 0b111 | 128 | 
Read Burst

Write Burst

Verilog Components
Clock Unit
/**
 * @module tick_divider
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief simple clock divider: a free-running binary counter on clk_src,
 *        so bit k of clk_group toggles at clk_src frequency / 2^(k+1)
 * @param DATA_WIDTH number of divided clock outputs (counter width)
 * @input clk_src source clock signal
 * @output clk_group divided clock signals
 */
module tick_divider
#(parameter DATA_WIDTH = 32)
(
    input clk_src,
    output reg [(DATA_WIDTH-1): 0] clk_group
);
    // start from all zeros so the counter has a known value
    initial clk_group <= {DATA_WIDTH{1'b0}};

    // count up by one on every rising edge; wraps around on overflow
    always @(posedge clk_src)
        clk_group <= clk_group + 1'b1;
endmodule
Signal Unit
/**
 * @module integer_to_segment
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief encodes one hexadecimal digit as an 8-bit seven-segment pattern
 *        (the "off" pattern is all ones, so a 0 bit presumably lights a
 *        segment — confirm against the board)
 * @input int_data hexadecimal digit (4 bit)
 * @output seg_data segment pattern for cNodes
 */
module integer_to_segment
(
    input [3:0] int_data,
    output reg [7:0] seg_data
);
    // Pure combinational decode: blocking assignments and an automatic
    // sensitivity list are the conventional form for this kind of logic.
    always @(*) begin
        case (int_data)
            4'b0000:    seg_data = 8'b11000000;    // 0
            4'b0001:    seg_data = 8'b11111001;    // 1
            4'b0010:    seg_data = 8'b10100100;    // 2
            4'b0011:    seg_data = 8'b10110000;    // 3
            4'b0100:    seg_data = 8'b10011001;    // 4
            4'b0101:    seg_data = 8'b10010010;    // 5
            4'b0110:    seg_data = 8'b10000010;    // 6
            4'b0111:    seg_data = 8'b11111000;    // 7
            4'b1000:    seg_data = 8'b10000000;    // 8
            4'b1001:    seg_data = 8'b10010000;    // 9
            4'b1010:    seg_data = 8'b10001000;    // a
            4'b1011:    seg_data = 8'b10000011;    // b
            4'b1100:    seg_data = 8'b11000110;    // c
            4'b1101:    seg_data = 8'b10100001;    // d
            4'b1110:    seg_data = 8'b10000110;    // e
            4'b1111:    seg_data = 8'b10001110;    // f
            // only reachable when int_data contains x/z bits
            default:    seg_data = 8'b11111111;    // off
        endcase
    end
endmodule
/**
 * @module data_to_segment
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief raw data to segment encoder: every 4-bit digit of data is encoded
 *        into 8 bits of seg_data by its own integer_to_segment instance
 * @param DATA_WIDTH data width (must be a multiple of 4)
 * @input data raw data
 * @output seg_data bit data for cNodes (8 bits per digit)
 */
module data_to_segment
#(parameter DATA_WIDTH = 32)
(
    input [(DATA_WIDTH-1):0] data,
    output [(DATA_WIDTH*2)-1:0] seg_data
);
    // One encoder per digit. The original instantiated exactly eight encoders
    // (trans1..trans8) regardless of DATA_WIDTH; the loop yields the same
    // wiring for DATA_WIDTH = 32 and also works for other widths.
    // Hierarchical instance names are now trans[0].encoder .. trans[N-1].encoder.
    genvar digit;
    generate
        for (digit = 0; digit < DATA_WIDTH/4; digit = digit + 1) begin : trans
            integer_to_segment encoder (
                .int_data(data[digit*4 +: 4]),
                .seg_data(seg_data[digit*8 +: 8])
            );
        end
    endgenerate
endmodule
/**
 * @module led_unit
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief led display module (bind to aNodes and cNodes in FPGA); scans the
 *        eight digits one after another, one digit per clk_src cycle
 * @param DATA_WIDTH data width (the digit selection below indexes seg_data
 *        up to bit 63, i.e. it assumes DATA_WIDTH = 32)
 * @input clk_src clock signal (light different led on in round turn)
 * @input led_data raw data
 * @output aNodes determine which led light on at now (one bit low per digit)
 * @output cNodes determine how led light on (number)
 */
module led_unit
#(parameter DATA_WIDTH = 32)
(
    input clk_src,
    input [(DATA_WIDTH-1):0] led_data,
    output reg [7:0] aNodes,
    output reg [7:0] cNodes
);
    reg [2:0] count; // 2^3 = 8
    wire [(DATA_WIDTH*2)-1:0] seg_data;

    // Only the scan counter needs an initial value; aNodes/cNodes are purely
    // combinational and are driven by the always @(*) block below.
    initial begin
        count <= 0;
    end

    data_to_segment #(
        .DATA_WIDTH(DATA_WIDTH)
    ) data_to_segment (
        .data(led_data),
        .seg_data(seg_data)
    );

    // scan counter: nonblocking assignment because it is a clocked register
    always @(posedge clk_src) begin
        count <= count + 1'b1;
    end

    // Digit select. The original was sensitive to count only, so a change of
    // led_data was not shown until the counter moved on; @(*) also tracks seg_data.
    always @(*) begin
        case (count)
            3'b000: begin
                aNodes = 8'b11111110;
                cNodes = seg_data[7:0];
            end
            3'b001: begin
                aNodes = 8'b11111101;
                cNodes = seg_data[15:8];
            end
            3'b010: begin
                aNodes = 8'b11111011;
                cNodes = seg_data[23:16];
            end
            3'b011: begin
                aNodes = 8'b11110111;
                cNodes = seg_data[31:24];
            end
            3'b100: begin
                aNodes = 8'b11101111;
                cNodes = seg_data[39:32];
            end
            3'b101: begin
                aNodes = 8'b11011111;
                cNodes = seg_data[47:40];
            end
            3'b110: begin
                aNodes = 8'b10111111;
                cNodes = seg_data[55:48];
            end
            3'b111: begin
                aNodes = 8'b01111111;
                cNodes = seg_data[63:56];
            end
            // only reachable when count contains x/z bits
            default: begin
                aNodes = 8'b11111110;
                cNodes = 8'b11111111;
            end
        endcase
    end
endmodule
ALU Unit
/**
 * @module counter
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief up counter with synchronous reset and enable
 * @param DATA_WIDTH data width
 * @param STEP counting step
 * @input clk clock signal
 * @input rst reset signal (synchronous, active high)
 * @input en enable signal
 * @output count counting value
 */
module counter
#(parameter DATA_WIDTH = 1, STEP = 1)
(
    input clk,
    input rst,
    input en,
    output reg [(DATA_WIDTH-1):0] count
);
    always @(posedge clk) begin
        if (rst) begin
            count <= 0 ;
        end else if (en) begin
            // advance by STEP (the original always added 1 and ignored the
            // parameter); with the default STEP = 1 the behaviour is unchanged
            count <= count + STEP;
        end else begin
            count <= count;
        end
    end
endmodule // counter
/**
 * @module latch_counter
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief 1-bit latching counter: once count has reached 1 it stays there
 *        until rst clears it
 * @input clk clock signal
 * @input rst reset signal (synchronous, active high)
 * @input en enable signal
 * @output count counting value
 */
module latch_counter
(
    input clk,
    input rst,
    input en,
    output reg count
);
    initial count <= 0;

    always @(posedge clk) begin
        if (rst)
            count <= 0;
        else if (en && (count != 1))
            count <= count + 1;
        // in every other case count simply keeps its value
    end
endmodule // latch_counter
/**
 * @module alu_flags
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief get flags after alu calculation; the overflow flags are only
 *        produced for aluOP 5 (add) and 6 (subtract), otherwise they are 0
 * @param DATA_WIDTH data width
 * @input srcA A port data
 * @input srcB B port data
 * @input aluOP operation code
 * @output zero equal flag (srcA == srcB, independent of aluOP)
 * @output of signed overflow flag
 * @output uof unsigned overflow flag (carry out / borrow)
 */
module alu_flags
#(parameter DATA_WIDTH = 32)
(
    input [DATA_WIDTH-1:0] srcA,
    input [DATA_WIDTH-1:0] srcB,
    input [3:0] aluOP,
    output zero,
    output of,
    output uof
);
    localparam MSB = DATA_WIDTH - 1;

    wire [DATA_WIDTH-1:0] sum, diff;
    wire carry1, carry2;

    // one extra result bit on the left captures the carry / borrow
    assign {carry1, sum} = srcA + srcB;
    assign {carry2, diff} = srcA - srcB;

    // sign bits of operands and results
    wire signA    = srcA[MSB];
    wire signB    = srcB[MSB];
    wire signSum  = sum[MSB];
    wire signDiff = diff[MSB];

    wire isAdd = (aluOP == 4'd5);
    wire isSub = (aluOP == 4'd6);

    // add: both operands share a sign and the result's sign differs
    wire addOverflow = (signA & signB & ~signSum)
                     | (~signA & ~signB & signSum);
    // subtract: operand signs differ and the result's sign differs from srcA
    wire subOverflow = (signA & ~signB & ~signDiff)
                     | (~signA & signB & signDiff);

    assign zero = (srcA == srcB);
    assign of   = isAdd ? addOverflow
                : isSub ? subOverflow
                : 1'b0;
    assign uof  = isAdd ? carry1
                : isSub ? carry2
                : 1'b0;
endmodule // alu_flags
/**
 * @module alu
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief combinational ALU; aluOP selects the operation:
 *        0 shift left, 1 arithmetic shift right, 2 logical shift right,
 *        3 multiply, 4 divide, 5 add, 6 subtract, 7 and, 8 or, 9 xor,
 *        10 nor, 11 signed less-than, 12 unsigned less-than, others 0
 * @param DATA_WIDTH data width
 * @input srcA A port data
 * @input srcB B port data
 * @input aluOP operation code
 * @output aluOut calculation result
 * @output zero equal flag
 * @output of signed overflow flag
 * @output uof unsigned overflow flag
 */
module alu
#(parameter DATA_WIDTH = 32)
(
    input [DATA_WIDTH-1:0] srcA,
    input [DATA_WIDTH-1:0] srcB,
    input [3:0] aluOP,
    output reg [DATA_WIDTH-1:0] aluOut,
    output zero,
    output of,
    output uof
);
    // signed views of the operands, needed for >>> and the signed comparison
    wire signed [DATA_WIDTH-1:0] signed_srcA;
    wire signed [DATA_WIDTH-1:0] signed_srcB;
    assign signed_srcA = $signed(srcA);
    assign signed_srcB = $signed(srcB);

    // Combinational logic: blocking assignments (the original used
    // nonblocking ones, which is the form meant for clocked registers).
    always @ ( * ) begin
        case (aluOP)
            4'd0: aluOut = srcA << srcB;
            4'd1: aluOut = signed_srcA >>> srcB;   // sign-extending shift
            4'd2: aluOut = srcA >> srcB;
            4'd3: aluOut = srcA * srcB;            // low DATA_WIDTH bits of the product
            4'd4: aluOut = srcA / srcB;            // yields x in simulation when srcB is 0
            4'd5: aluOut = srcA + srcB;
            4'd6: aluOut = srcA - srcB;
            4'd7: aluOut = srcA & srcB;
            4'd8: aluOut = srcA | srcB;
            4'd9: aluOut = srcA ^ srcB;
            4'd10: aluOut = ~(srcA | srcB);
            4'd11: aluOut = (signed_srcA < signed_srcB) ? 1 : 0;
            4'd12: aluOut = (srcA < srcB) ? 1 : 0;
            default: aluOut = 0;
        endcase
    end

    // flag generation lives in its own module
    alu_flags #(
        .DATA_WIDTH(DATA_WIDTH)
    ) FLAGS  (
        .srcA(srcA),
        .srcB(srcB),
        .aluOP(aluOP),
        .zero(zero),
        .of(of),
        .uof(uof)
    );
endmodule // alu
Memory Unit
/**
 * @module register
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief D flip-flop register with synchronous reset and enable
 * @param DATA_WIDTH data width
 * @input clk clock signal
 * @input rst reset signal (synchronous, takes priority over en)
 * @input en enable signal
 * @input din data in
 * @output dout data out
 */
module register
#(parameter DATA_WIDTH = 32)
(
    input clk,
    input rst,
    input en,
    input [DATA_WIDTH-1:0] din,
    output reg [DATA_WIDTH-1:0] dout
);
    always @(posedge clk) begin
        if (rst)
            dout <= {DATA_WIDTH{1'b0}};    // reset
        else if (en)
            dout <= din;                   // update
        // neither rst nor en asserted: dout holds its value
    end
endmodule // register
/**
 * @module regFile
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief register files for MIPS CPU, contains 32 registers; register 0
 *        always reads as 0 and is never written
 * @param DATA_WIDTH data width
 * @input clk clock signal
 * @input rst reset signal (synchronous, clears all registers)
 * @input we write enable signal
 * @input raddrA read address (No.register) for A out port
 * @input raddrB read address (No.register) for B out port
 * @input waddr write address (No.register) for wdata (in port)
 * @input wdata data to write into regFile
 * @output regA A port output
 * @output regB B port output
 * @output v0_data content of register `V0 (index defined in defines.vh)
 * @output a0_data content of register `A0 (index defined in defines.vh)
 */
module regFile
#(parameter DATA_WIDTH = 32)
(
    input clk,
    input rst,
    input we,
    input [4:0] raddrA,
    input [4:0] raddrB,
    input [4:0] waddr,
    input [DATA_WIDTH-1:0] wdata,
    output [DATA_WIDTH-1:0] regA,
    output [DATA_WIDTH-1:0] regB,
    output [DATA_WIDTH-1:0] v0_data,
    output [DATA_WIDTH-1:0] a0_data
);
`include "defines.vh"
    // Loop counter is an integer: a 5-bit reg can never reach 32, which is
    // why the original loop stopped at "i < 31" and left register 31 unreset.
    integer i;
    ///< three ported regFile contains 32 registers
    reg [DATA_WIDTH-1:0] regFile [0:31];
    always @ (posedge clk) begin
        if (rst) begin
            for (i = 0; i < 32; i = i + 1)
                begin
                    regFile[i] <= 0;
                end
        end else if (we && waddr != 0) begin
            regFile[waddr] <= wdata;
        end
    end
    // Read ports: register 0 is checked first so that a (discarded) write to
    // register 0 can never be bypassed to the output; otherwise a write in
    // progress to the same address is forwarded directly.
    assign regA = (raddrA == 0) ? 0
                : (we && waddr == raddrA) ? wdata
                : regFile[raddrA];
    assign regB = (raddrB == 0) ? 0
                : (we && waddr == raddrB) ? wdata
                : regFile[raddrB];
    assign v0_data = regFile[`V0];
    assign a0_data = regFile[`A0];
endmodule // regFile
/**
 * @module imem
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief instruction memory (ROM) with 2^BUS_WIDTH words, loaded once from
 *        a hexadecimal file and read combinationally
 * @param DATA_WIDTH data width
 * @param BUS_WIDTH bus width
 * @param CODE_FILE MIPS assembly hexadecimal code file
 * @input addr memory address
 * @output rdata instruction read out from memory
 */
module imem
#(parameter DATA_WIDTH = 32, BUS_WIDTH = 10, CODE_FILE= "mips/benchmark.hex")
(
    input [BUS_WIDTH-1:0] addr,
    output [DATA_WIDTH-1:0] rdata
);
    // number of words addressable with BUS_WIDTH address bits
    localparam DEPTH = 2**BUS_WIDTH;

    reg [DATA_WIDTH-1:0] ROM [0:DEPTH-1];

    // fill the whole address range from the code file
    initial $readmemh(CODE_FILE, ROM, 0, DEPTH-1);

    assign rdata = ROM[addr];
endmodule // imem
/**
 * @module dmem
 * @author sabertazimi
 * @email sabertazimi@gmail.com
 * @brief data memory (RAM) with 2^BUS_WIDTH words: synchronous write,
 *        combinational read, plus a second read port for the led display
 * @param DATA_WIDTH data width
 * @param BUS_WIDTH bus width
 * @input clk clock signal
 * @input re read enable signal
 * @input we write enable signal
 * @input addr memory address
 * @input wdata data write into memory
 * @input switch_addr address of the word shown on led_data (5 bit, so only
 *        the first 32 words can be selected)
 * @output rdata data read out from memory (all x while re is low)
 * @output led_data word at switch_addr
 */
module dmem
#(parameter DATA_WIDTH = 32, BUS_WIDTH = 10)
(
    input clk,
    input re,
    input we,
    input [BUS_WIDTH-1:0] addr,
    input [DATA_WIDTH-1:0] wdata,
    input [4:0] switch_addr,
    output [DATA_WIDTH-1:0] rdata,
    output [DATA_WIDTH-1:0] led_data
);
    reg [DATA_WIDTH-1:0] RAM [0:(2**BUS_WIDTH)-1];
    always @ (posedge clk) begin
        if (we) begin
            RAM[addr] <= wdata;
        end
    end
    // The x fill is DATA_WIDTH bits wide; the original replicated only
    // DATA_WIDTH-1 bits, so the top bit was zero-extended to 0 instead of x.
    assign rdata = re ? RAM[addr] : {DATA_WIDTH{1'bx}};
    assign led_data = RAM[switch_addr];
endmodule // dmem