[Home]

Day 1: Teaching Caleb SystemVerilog Basics

I spent the first day of this challenge showing Caleb the basics of SystemVerilog. He made a module that outputs the bits for hello world over and over:

// Serialises the 13-character string "Hello World!\n" on `data`, one bit per
// rising edge of `clock`, starting with the most significant bit of the
// packed string, and restarts from the top once bit 0 has been sent.
module helloworld (input clock, output data);

// Index of the top bit of the packed message (13 characters, 8 bits each).
localparam TOP_BIT = 13*8 - 1;
// Index of the top bit of a counter wide enough to address every message bit.
localparam INDEX_TOP_BIT = $clog2(TOP_BIT + 1) - 1;

bit [TOP_BIT : 0] message = "Hello World!\n";
// Position of the bit currently driven on `data`; counts down from TOP_BIT.
bit [INDEX_TOP_BIT : 0] bit_index = TOP_BIT;

assign data = message[bit_index];

// Step to the next lower bit each clock, wrapping back to the top after bit 0.
always @(posedge clock)
    bit_index <= (bit_index == 0) ? TOP_BIT : bit_index - 1;

endmodule

We are using Verilator; here is a simulator for the module:

#include "Vhelloworld.h"
#include "verilated.h"
#include <iostream>

using namespace std;

int main(int argc, char** argv) {
    Verilated::commandArgs(argc, argv);
    Vhelloworld* hi = new Vhelloworld;

    hi->clock = 0;

    hi->eval();

    char currentChar;
    int shift = 7;
    while (!(Verilated::gotFinish())) {
        hi->eval();

        if (!(hi->clock)) { 
            currentChar |= (hi->data) << shift;

            if (shift != 0) {
                shift -= 1;
            } else {
                cerr << currentChar;
                currentChar = 0;
                shift = 7;
            }
        }

        hi->clock = !(hi->clock);
    }

    return 0;
}

You can download these here.

Day 2: Image Filtering in SystemVerilog

Download the full solution.

So the challenge is to make a module that can take an image and a kernel and output the convolution. Here is my initial solution:

// Fully combinational 2-D convolution of an image with a square kernel.
//
// For every output position the module multiplies the (2*KSize+1)^2 window of
// unsigned input pixels by the signed kernel, sums the products, scales the
// sum by the signed fixed-point `reciprocal`, and converts the result to an
// unsigned pixel: negative results become 0, results above the pixel range
// saturate to the maximum pixel value, and the fractional part is truncated.
//
// Parameters:
//   PixWidth   - bits per pixel.
//   InWidth    - first dimension of image_in.
//   InHeight   - second dimension of image_in.
//   KElemWidth - bits per (signed) kernel element.
//   KSize      - kernel radius; the kernel is (2*KSize+1) square.
// Ports:
//   image_in   - unsigned input pixels.
//   kernel     - signed kernel coefficients.
//   reciprocal - signed fixed-point scale factor with RecFracWidth
//                fractional bits.
//   image_out  - unsigned output pixels; the image shrinks by KSize on
//                every side.
module Convolver #(
    parameter PixWidth = 8,
    parameter InWidth = 7,
    parameter InHeight = 7,
    parameter KElemWidth = 6,
    parameter KSize = 1, // 3x3, 2 is 5x5
    localparam KWidth = (2*KSize + 1),
    // Wide enough for the sum of KWidth*KWidth pixel-by-kernel products.
    localparam RecWidth = PixWidth + KElemWidth + $clog2(KWidth*KWidth),
    localparam RecFracWidth = RecWidth - 2,
    localparam OutWidth = InWidth - 2*KSize,
    localparam OutHeight = InHeight - 2*KSize
) (
    input [PixWidth-1:0] image_in [0:InWidth-1] [0:InHeight-1],
    input signed [KElemWidth-1:0] kernel [0:2*KSize] [0:2*KSize],
    // Two integer bits (including sign) and RecFracWidth fractional bits,
    // i.e. -2.0 to almost 2.0.
    input signed [RecWidth-1:0] reciprocal,
    output [PixWidth-1:0] image_out [0:OutWidth-1] [0:OutHeight-1]
);

// Largest value an output pixel can hold; used as the saturation value.
localparam MaxPixel = integer'($pow(2, PixWidth) - 1);

// Magnitude of the scale factor. Its sign is applied to the kernel sum
// instead, so the final multiplication is unsigned.
logic [RecWidth-1:0] abs_reciprocal;
assign abs_reciprocal = (reciprocal[RecWidth-1])
        ? (~reciprocal) + 1 : reciprocal;

genvar x, y, kx, ky;
generate
for (x = 0; x < OutWidth; x++) begin
    for (y = 0; y < OutHeight; y++) begin
        // One product per kernel element for the window anchored at (x, y).
        logic signed [RecWidth-1:0] kernel_multiplied [0:2*KSize] [0:2*KSize];
        for (kx = 0; kx <= 2*KSize; kx++) begin
            for (ky = 0; ky <= 2*KSize; ky++) begin
                // Zero-extend by one bit so the unsigned pixel stays
                // non-negative in the signed multiplication.
                logic signed [PixWidth:0] pixel;
                assign pixel = {1'b0,image_in[x+kx][y+ky]};
                assign kernel_multiplied[kx][ky] = pixel * kernel[kx][ky];
            end
        end

        bit [$clog2(KWidth)-1:0] i;
        bit [$clog2(KWidth)-1:0] j;
        logic signed [RecWidth-1:0] kernel_msum;
        // Sum all products of this window.
        always @* begin
            kernel_msum = 0;
            for (i = 0; i < KWidth; i++) begin
                for (j = 0; j < KWidth; j++) begin
                    kernel_msum += kernel_multiplied[i][j];
                end
            end
        end

        // Despite the name this is not an absolute value: the sum is negated
        // when `reciprocal` is negative, which gives it the sign of the final
        // product sum * reciprocal.
        logic signed [RecWidth-1:0] abs_kernel_msum;
        logic [RecWidth-1:0] clamped_kernel_msum;
        assign abs_kernel_msum = (reciprocal[RecWidth-1])
                ? (~kernel_msum) + 1 : kernel_msum;
        // A negative product cannot be shown as a pixel; clamp it to zero.
        assign clamped_kernel_msum = (abs_kernel_msum[RecWidth-1])
                ? 0 : abs_kernel_msum;

        // Only part of the product is used (the fractional bits are dropped),
        // hence the lint waiver.
        /* verilator lint_off UNUSED */
        logic signed [2*RecWidth-1:0] quotient;
        /* verilator lint_on UNUSED */
        assign quotient = clamped_kernel_msum * abs_reciprocal;

        // The integer part of the product starts at bit RecFracWidth. If any
        // integer bit above the pixel range is set the result does not fit,
        // so saturate to MaxPixel instead of letting it wrap around.
        logic overflow;
        assign overflow = |quotient[2*RecWidth-1:PixWidth+RecFracWidth];
        assign image_out[x][y] = overflow
                ? PixWidth'(MaxPixel)
                : quotient[PixWidth+RecFracWidth-1:RecFracWidth];
    end
end
endgenerate


endmodule

Now, this actually works just fine in simulation. It takes a lot of time to compile, but it does work. The issue is that for a 100x100 image and a 3x3 kernel, this module requires 100,000 multipliers! Unless you want to buy 10 Stratix 10 GX 2800s, totaling nearly a quarter of a million dollars, this really isn’t in your reach. A much better idea is to split the image up into chunks, pass each of those into the module sequentially, and then stitch the results back up into an output image. This module does that:

// Sequential wrapper around Convolver that processes a large image one square
// chunk per clock cycle, so that only a bounded number of hardware multipliers
// is needed.
//
// Each cycle a ChunkInWidth-square window of image_in is latched into
// chunk_in; the combinational Convolver result for the window latched in the
// previous cycle is copied into image_out at that window's position. Chunks
// are visited along x first, then y. The last chunk of every row and column is
// pulled back so that it ends exactly on the image border (overlapping the
// previous chunk) instead of running past it.
//
// Parameters:
//   PixWidth, InWidth, InHeight, KElemWidth, KSize - as for Convolver.
//   Multipliers - multiplier budget used to size the chunk.
//   MultInWidth - operand width of one hardware multiplier.
// Ports:
//   clock, reset - synchronous reset restarts the scan from chunk (0, 0).
//   image_in, kernel, reciprocal - as for Convolver.
//   image_out - filled in chunk by chunk; complete once `done` is high.
//   done      - set after the last chunk has been written to image_out.
//
// NOTE(review): image_out and done are assigned procedurally but declared
// without an explicit `logic`/`var` type; some tools may require
// `output logic` here - confirm against the target toolchain.
module ChunkedConvolver #(
    parameter PixWidth = 8,
    parameter InWidth = 100,
    parameter InHeight = 100,
    parameter KElemWidth = 6,
    parameter KSize = 1, // 3x3, 2 is 5x5
    parameter Multipliers = 250,
    parameter MultInWidth = 18,
    localparam KWidth = (2*KSize + 1),
    localparam RecWidth = PixWidth + KElemWidth + $clog2(KWidth*KWidth),
    localparam RecFracWidth = RecWidth - 2,
    localparam OutWidth = InWidth - 2*KSize,
    localparam OutHeight = InHeight - 2*KSize
) (
    input clock, reset,
    input [PixWidth-1:0] image_in [0:InWidth-1] [0:InHeight-1],
    input signed [KElemWidth-1:0] kernel [0:2*KSize] [0:2*KSize],
    // Two integer bits (including sign) and RecFracWidth fractional bits,
    // i.e. -2.0 to almost 2.0.
    input signed [RecWidth-1:0] reciprocal,
    output [PixWidth-1:0] image_out [0:OutWidth-1] [0:OutHeight-1],
    output done = 1'b0
);

// Number of MultInWidth-wide multipliers needed for one pixel-by-kernel
// product and for one sum-by-reciprocal product.
localparam KernelMultWidth = (PixWidth+1 > KElemWidth) ? PixWidth+1 : KElemWidth;
localparam MultsPerKernelMult
        = $pow($ceil(real'(KernelMultWidth) / real'(MultInWidth)), 2);
localparam MultsPerRecMult
        = $pow($ceil(real'(RecWidth) / real'(MultInWidth)), 2);

// Multipliers = (KWidth*KWidth*MultsPerKernelMult + MultsPerRecMult)
//                  * ChunkOutWidth*ChunkOutWidth;
// Solved for ChunkOutWidth and rounded down:
localparam ChunkOutWidthMax = integer'($floor($sqrt(Multipliers)
        / $sqrt(KWidth*KWidth*MultsPerKernelMult + MultsPerRecMult)));

localparam GreaterOutDim = (OutWidth > OutHeight) ? OutWidth : OutHeight;
localparam LesserOutDim = (OutWidth < OutHeight) ? OutWidth : OutHeight;

// A chunk never needs to be larger than the smaller output dimension.
localparam ChunkOutWidthInteger
        = (ChunkOutWidthMax < LesserOutDim) ? ChunkOutWidthMax : LesserOutDim;
// Coordinates must be able to hold a chunk origin plus one chunk width.
localparam OrdWidth = $clog2(GreaterOutDim + ChunkOutWidthInteger);
typedef logic [OrdWidth-1:0] ord_t;
localparam ChunkOutWidth = ord_t'(ChunkOutWidthInteger);

localparam ChunkInWidth = ChunkOutWidth + 2*KSize;

// Origin of the last chunk in each direction: the chunk that ends exactly on
// the output image border.
localparam LastChunkX = ord_t'(OutWidth - ChunkOutWidthInteger);
localparam LastChunkY = ord_t'(OutHeight - ChunkOutWidthInteger);

logic [PixWidth-1:0] chunk_in [0:ChunkInWidth-1] [0:ChunkInWidth-1];
logic [PixWidth-1:0] chunk_out [0:ChunkOutWidth-1] [0:ChunkOutWidth-1];

Convolver #(
    .PixWidth(PixWidth),
    .InWidth(ChunkInWidth),
    .InHeight(ChunkInWidth),
    .KElemWidth(KElemWidth),
    .KSize(KSize)
) conv (
    .image_in(chunk_in),
    .kernel(kernel),
    .reciprocal(reciprocal),
    .image_out(chunk_out)
);

// chunk_*      : origin of the chunk being latched into chunk_in this cycle.
// prev_chunk_* : origin of the chunk whose result is on chunk_out this cycle.
ord_t prev_chunk_x = 0;
ord_t chunk_x = 0;
ord_t next_chunk_x;
ord_t prev_chunk_y = 0;
ord_t chunk_y = 0;
ord_t next_chunk_y;

assign next_chunk_x = chunk_x + ChunkOutWidth;
assign next_chunk_y = chunk_y + ChunkOutWidth;

// Set once the final chunk has been latched; one more cycle is then needed to
// copy its result out before `done` may rise.
logic almost_done = 1'b0;

// The loop counters must be able to hold the loop bound itself, otherwise the
// exit comparison can never become false when the bound is a power of two.
localparam OutIdxWidth = $clog2(ChunkOutWidth + 1);
localparam InIdxWidth = $clog2(ChunkInWidth + 1);

// Zero padding that widens a loop counter to ord_t width.
`define OPAD {(OrdWidth - OutIdxWidth){1'b0}}
`define IPAD {(OrdWidth - InIdxWidth){1'b0}}

bit [OutIdxWidth-1:0] ox, oy;
bit [InIdxWidth-1:0] ix, iy;

always @(posedge clock) begin
    if (reset) begin
        prev_chunk_x <= 0;
        chunk_x <= 0;
        prev_chunk_y <= 0;
        chunk_y <= 0;
        almost_done <= 1'b0;
        done <= 1'b0;
    end else if (!done) begin
        // Store the result of the chunk latched in the previous cycle.
        for (ox = 0; {`OPAD,ox} < ChunkOutWidth; ox++) begin
            for (oy = 0; {`OPAD,oy} < ChunkOutWidth; oy++) begin
                image_out[prev_chunk_x+{`OPAD,ox}][prev_chunk_y+{`OPAD,oy}] <= chunk_out[ox][oy];
            end
        end
        prev_chunk_x <= chunk_x;
        prev_chunk_y <= chunk_y;
        // Latch the input window of the current chunk.
        for (ix = 0; {`IPAD,ix} < ChunkInWidth; ix++) begin
            for (iy = 0; {`IPAD,iy} < ChunkInWidth; iy++) begin
                chunk_in[ix][iy] <= image_in[chunk_x+{`IPAD,ix}][chunk_y+{`IPAD,iy}];
            end
        end
        // Advance to the next chunk origin.
        if (next_chunk_x >= OutWidth) begin
            // The current chunk reaches the end of the row.
            if (next_chunk_y >= OutHeight) begin
                // It is also in the last row: this was the final chunk.
                if (almost_done) begin
                    done <= 1'b1;
                end else begin
                    almost_done <= 1'b1;
                end
            end else begin
                chunk_x <= 0;
                if (next_chunk_y > LastChunkY) begin
                    // A full step would overshoot; pull the last row of
                    // chunks back so it ends on the bottom border.
                    chunk_y <= LastChunkY;
                end else begin
                    chunk_y <= next_chunk_y;
                end
            end
        end else if (next_chunk_x > LastChunkX) begin
            // Same clamping in x: without it the last chunk of a row would
            // index image_in and image_out beyond their bounds.
            chunk_x <= LastChunkX;
        end else begin
            chunk_x <= next_chunk_x;
        end
    end
end

endmodule

Here’s a testbench for it:

#include "VChunkedConvolver.h"
#include "verilated.h"
#include <cstdio>
#include <cstring>
#include "png.h"

int main(int argc, char **argv) {
    Verilated::commandArgs(argc, argv);
    VChunkedConvolver *top = new VChunkedConvolver;

    top->clock = 0;
    top->reset = 1;

    png_image png_in;
    png_in.version = PNG_IMAGE_VERSION;
    png_in.opaque = NULL;
    png_image_begin_read_from_file(&png_in, "Vd-Lum.png");
    png_in.format = PNG_FORMAT_GRAY;
    png_image_finish_read(&png_in, NULL, top->image_in[0], 0, NULL);
    png_image_free(&png_in);

    int8_t kernel[3][3] = {
        {-1, -1, -1},
        {-1,  8, -1},
        {-1, -1, -1}
    };
    for (int x = 0; x < 3; ++x) {
        for (int y = 0; y < 3; ++y) {
            top->kernel[x][y] = *reinterpret_cast<CData*>(&kernel[x][y]);
        }
    }
    top->reciprocal = (1 << 16);

    top->eval();

    while(!(Verilated::gotFinish() || top->done)) {
        top->clock = 1;
        top->eval();
        top->clock = 0;
        top->eval();
        top->reset = 0;
    }

    png_image png_out;
    memset(&png_out, 0, sizeof(png_image));
    png_out.opaque = NULL;
    png_out.version = PNG_IMAGE_VERSION;
    png_out.width = 98;
    png_out.height = 98;
    png_out.format = PNG_FORMAT_GRAY;
    png_image_write_to_file(&png_out, "Vd-Result.png", 0,
            top->image_out[0], 0, NULL);
    png_image_free(&png_out);

    delete top;
    exit(0);
}

Results:

Input Output
Input Output

Success!

This took so long. . . . The time for sleep is now.