[Back to the 30 Days of Programming Overview Page]
Day 1: Teaching Caleb SystemVerilog Basics
I spent the first day of this challenge showing Caleb the basics of SystemVerilog. He made a module that outputs the bits for hello world over and over:
// Serialises the ASCII text "Hello World!\n" on `data`, one bit per rising
// edge of `clock`. Bits are sent starting from the most significant bit of
// the packed string, and the sequence restarts once bit 0 has been sent.
module helloworld (input clock, output data);
  // The message is 13 characters of 8 bits each.
  localparam MSG_MSB = 13*8 - 1;
  // Index register just wide enough to address every bit of the message.
  localparam IDX_MSB = $clog2(MSG_MSB + 1) - 1;

  bit [MSG_MSB : 0] message   = "Hello World!\n";
  bit [IDX_MSB : 0] bit_index = MSG_MSB;

  // The currently selected message bit is driven out combinationally.
  assign data = message[bit_index];

  // Walk down through the message; wrap back to the top after bit 0.
  always @(posedge clock) begin
    bit_index <= (bit_index != 0) ? bit_index - 1 : MSG_MSB;
  end
endmodule
We are using Verilator; here is a simulator for the module:
#include "Vhelloworld.h"
#include "verilated.h"
#include <iostream>
using namespace std;
int main(int argc, char** argv) {
Verilated::commandArgs(argc, argv);
Vhelloworld* hi = new Vhelloworld;
hi->clock = 0;
hi->eval();
char currentChar;
int shift = 7;
while (!(Verilated::gotFinish())) {
hi->eval();
if (!(hi->clock)) {
currentChar |= (hi->data) << shift;
if (shift != 0) {
shift -= 1;
} else {
cerr << currentChar;
currentChar = 0;
shift = 7;
}
}
hi->clock = !(hi->clock);
}
return 0;
}
You can download these here.
Day 2: Image Filtering in SystemVerilog
So the challenge is to make a module that can take an image and a kernel and output the convolution. Here is my initial solution:
// Fully combinational 2-D convolution.
//
// For every output pixel the module multiplies the (2*KSize+1)^2 input
// neighbourhood by the signed kernel, sums the products, scales the sum by
// the fixed-point `reciprocal`, and clamps the result to the unsigned pixel
// range [0, MaxPixel].
//
// Parameters:
//   PixWidth   - bits per unsigned pixel.
//   InWidth    - first dimension of image_in.
//   InHeight   - second dimension of image_in.
//   KElemWidth - bits per signed kernel element.
//   KSize      - kernel radius (1 gives 3x3, 2 gives 5x5).
//
// Ports:
//   image_in   - unsigned input pixels.
//   kernel     - signed kernel coefficients.
//   reciprocal - signed fixed-point scale factor with RecFracWidth fractional
//                bits, i.e. it represents -2.0 up to almost 2.0.
//   image_out  - unsigned output pixels; the image shrinks by KSize on each
//                side because no padding is applied.
module Convolver #(
  parameter PixWidth = 8,
  parameter InWidth = 7,
  parameter InHeight = 7,
  parameter KElemWidth = 6,
  parameter KSize = 1, // 3x3, 2 is 5x5
  localparam KWidth = (2*KSize + 1),
  // Wide enough to hold the sum of KWidth*KWidth pixel*kernel products.
  localparam RecWidth = PixWidth + KElemWidth + $clog2(KWidth*KWidth),
  localparam RecFracWidth = RecWidth - 2,
  localparam OutWidth = InWidth - 2*KSize,
  localparam OutHeight = InHeight - 2*KSize
) (
  input [PixWidth-1:0] image_in [0:InWidth-1] [0:InHeight-1],
  input signed [KElemWidth-1:0] kernel [0:2*KSize] [0:2*KSize],
  input signed [RecWidth-1:0] reciprocal, // Representing -2.0 to almost 2.0
  output [PixWidth-1:0] image_out [0:OutWidth-1] [0:OutHeight-1]
);
  // Largest representable pixel value; used to saturate on overflow.
  localparam logic [PixWidth-1:0] MaxPixel = {PixWidth{1'b1}};

  // Magnitude of the scale factor. Its sign is folded into the kernel sum
  // below so that the final multiplication can be done unsigned.
  logic [RecWidth-1:0] abs_reciprocal;
  assign abs_reciprocal = (reciprocal[RecWidth-1])
    ? (~reciprocal) + 1 : reciprocal;

  genvar x, y, kx, ky;
  generate
    for (x = 0; x < OutWidth; x++) begin
      for (y = 0; y < OutHeight; y++) begin
        // One product per kernel tap for this output pixel.
        logic signed [RecWidth-1:0] kernel_multiplied [0:2*KSize] [0:2*KSize];
        for (kx = 0; kx <= 2*KSize; kx++) begin
          for (ky = 0; ky <= 2*KSize; ky++) begin
            // Zero-extend the unsigned pixel by one bit so it can take part
            // in a signed multiplication without being read as negative.
            logic signed [PixWidth:0] pixel;
            assign pixel = {1'b0,image_in[x+kx][y+ky]};
            assign kernel_multiplied[kx][ky] = pixel * kernel[kx][ky];
          end
        end

        // Sum of all kernel products for this output pixel.
        logic signed [RecWidth-1:0] kernel_msum;
        always_comb begin
          kernel_msum = 0;
          for (int i = 0; i < KWidth; i++) begin
            for (int j = 0; j < KWidth; j++) begin
              kernel_msum += kernel_multiplied[i][j];
            end
          end
        end

        // Negate the sum when the reciprocal is negative so that
        // sum * reciprocal == signed_sum * |reciprocal|.
        logic signed [RecWidth-1:0] abs_kernel_msum;
        logic [RecWidth-1:0] clamped_kernel_msum;
        assign abs_kernel_msum = (reciprocal[RecWidth-1])
          ? (~kernel_msum) + 1 : kernel_msum;
        // A negative scaled result clamps to black.
        assign clamped_kernel_msum = (abs_kernel_msum[RecWidth-1])
          ? 0 : abs_kernel_msum;

        // The fractional bits below RecFracWidth are intentionally dropped.
        /* verilator lint_off UNUSED */
        logic signed [2*RecWidth-1:0] quotient;
        /* verilator lint_on UNUSED */
        assign quotient = clamped_kernel_msum * abs_reciprocal;

        // Any set bit above the pixel field means the scaled value exceeds
        // MaxPixel; saturate instead of letting the slice wrap around.
        logic overflow;
        assign overflow = |quotient[2*RecWidth-1:PixWidth+RecFracWidth];
        assign image_out[x][y] = overflow
          ? MaxPixel : quotient[PixWidth+RecFracWidth-1:RecFracWidth];
      end
    end
  endgenerate
endmodule
Now, this actually works just fine in simulation. It takes a long time to compile, but it does work. The issue is that for a 100x100 image and a 3x3 kernel, this module requires roughly 100,000 multipliers (98x98 output pixels, each needing nine kernel multiplications plus one reciprocal multiplication)! Unless you want to buy 10 Stratix 10 GX 2800s, totaling nearly a quarter of a million dollars, this really isn't within your reach. A much better idea is to split the image up into chunks, pass each of those into the module sequentially, and then stitch the results back together into an output image. This module does that:
// Sequential wrapper around Convolver that processes the image one square
// chunk per clock so that only a bounded number of multipliers is needed.
//
// Each clock (while not in reset and not done):
//   * the combinational result for the chunk loaded on the previous clock is
//     written into image_out at (prev_chunk_x, prev_chunk_y);
//   * the next chunk of image_in at (chunk_x, chunk_y) is loaded into the
//     Convolver input registers;
//   * the chunk position advances along the first dimension, then the second.
// The last chunk in each dimension is clamped so that it ends exactly at the
// image edge (it may overlap the previous chunk). `done` rises once the final
// chunk has been written back.
//
// Parameters:
//   PixWidth, InWidth, InHeight, KElemWidth, KSize - as for Convolver.
//   Multipliers - multiplier budget used to size the chunk.
//   MultInWidth - operand width of one multiplier in that budget.
//
// Ports:
//   clock, reset - synchronous, active-high reset restarts the scan.
//   image_in     - unsigned input pixels.
//   kernel       - signed kernel coefficients.
//   reciprocal   - signed fixed-point scale factor, -2.0 up to almost 2.0.
//   image_out    - registered output pixels, valid once done is high.
//   done         - high when the whole image has been processed.
module ChunkedConvolver #(
  parameter PixWidth = 8,
  parameter InWidth = 100,
  parameter InHeight = 100,
  parameter KElemWidth = 6,
  parameter KSize = 1, // 3x3, 2 is 5x5
  parameter Multipliers = 250,
  parameter MultInWidth = 18,
  localparam KWidth = (2*KSize + 1),
  localparam RecWidth = PixWidth + KElemWidth + $clog2(KWidth*KWidth),
  localparam RecFracWidth = RecWidth - 2,
  localparam OutWidth = InWidth - 2*KSize,
  localparam OutHeight = InHeight - 2*KSize
) (
  input clock, reset,
  input [PixWidth-1:0] image_in [0:InWidth-1] [0:InHeight-1],
  input signed [KElemWidth-1:0] kernel [0:2*KSize] [0:2*KSize],
  input signed [RecWidth-1:0] reciprocal, // Representing -2.0 to almost 2.0
  // Both outputs are assigned procedurally, so they must be variables.
  output logic [PixWidth-1:0] image_out [0:OutWidth-1] [0:OutHeight-1],
  output logic done = 1'b0
);
  // Number of MultInWidth-wide multipliers needed for one pixel*kernel
  // product and for one sum*reciprocal product respectively.
  localparam KernelMultWidth = (PixWidth+1 > KElemWidth) ? PixWidth+1 : KElemWidth;
  localparam MultsPerKernelMult
    = $pow($ceil(real'(KernelMultWidth) / real'(MultInWidth)), 2);
  localparam MultsPerRecMult
    = $pow($ceil(real'(RecWidth) / real'(MultInWidth)), 2);

  // Multipliers >= (KWidth*KWidth*MultsPerKernelMult + MultsPerRecMult)
  //                * ChunkOutWidth*ChunkOutWidth
  // Divide before taking the square root: the quotient of two separately
  // rounded square roots can land just below an exact integer and floor to
  // one less than the true answer.
  localparam ChunkOutWidthMax = integer'($floor($sqrt(real'(Multipliers)
    / (KWidth*KWidth*MultsPerKernelMult + MultsPerRecMult))));
  localparam GreaterOutDim = (OutWidth > OutHeight) ? OutWidth : OutHeight;
  localparam LesserOutDim = (OutWidth < OutHeight) ? OutWidth : OutHeight;
  localparam ChunkOutWidthInteger
    = (ChunkOutWidthMax < LesserOutDim) ? ChunkOutWidthMax : LesserOutDim;
  localparam int ChunkInWidthInteger = ChunkOutWidthInteger + 2*KSize;

  // Coordinate type: wide enough for a chunk origin plus one chunk step.
  localparam OrdWidth = $clog2(GreaterOutDim + ChunkOutWidthInteger);
  typedef logic [OrdWidth-1:0] ord_t;
  localparam ChunkOutWidth = ord_t'(ChunkOutWidthInteger);
  localparam ChunkInWidth = ChunkOutWidth + 2*KSize;

  // Origin of the last chunk that still fits inside the output image.
  localparam ord_t LastChunkX = ord_t'(OutWidth - ChunkOutWidthInteger);
  localparam ord_t LastChunkY = ord_t'(OutHeight - ChunkOutWidthInteger);

  logic [PixWidth-1:0] chunk_in [0:ChunkInWidth-1] [0:ChunkInWidth-1];
  logic [PixWidth-1:0] chunk_out [0:ChunkOutWidth-1] [0:ChunkOutWidth-1];

  Convolver #(
    .PixWidth(PixWidth),
    .InWidth(ChunkInWidth),
    .InHeight(ChunkInWidth),
    .KElemWidth(KElemWidth),
    .KSize(KSize)
  ) conv (
    .image_in(chunk_in),
    .kernel(kernel),
    .reciprocal(reciprocal),
    .image_out(chunk_out)
  );

  // prev_chunk_* : origin of the chunk currently held in chunk_in.
  // chunk_*      : origin of the chunk that will be loaded on this clock.
  // next_chunk_* : chunk_* advanced by one chunk width.
  ord_t prev_chunk_x = 0;
  ord_t chunk_x = 0;
  ord_t next_chunk_x;
  ord_t prev_chunk_y = 0;
  ord_t chunk_y = 0;
  ord_t next_chunk_y;
  assign next_chunk_x = chunk_x + ChunkOutWidth;
  assign next_chunk_y = chunk_y + ChunkOutWidth;

  // Set when the final chunk has been loaded; one more clock is then needed
  // to write its result back before done may rise.
  logic almost_done = 1'b0;

  always_ff @(posedge clock) begin
    if (reset) begin
      prev_chunk_x <= 0;
      chunk_x <= 0;
      prev_chunk_y <= 0;
      chunk_y <= 0;
      almost_done <= 1'b0;
      done <= 1'b0;
    end else if (!done) begin
      // Write back the result for the chunk loaded on the previous clock.
      // The counters are plain ints: a counter only $clog2(N) bits wide
      // cannot hold N when N is a power of two, so `i < N` would never fail.
      for (int ox = 0; ox < ChunkOutWidthInteger; ox++) begin
        for (int oy = 0; oy < ChunkOutWidthInteger; oy++) begin
          image_out[prev_chunk_x + ord_t'(ox)][prev_chunk_y + ord_t'(oy)]
            <= chunk_out[ox][oy];
        end
      end
      prev_chunk_x <= chunk_x;
      prev_chunk_y <= chunk_y;

      // Load the next chunk of input pixels.
      for (int ix = 0; ix < ChunkInWidthInteger; ix++) begin
        for (int iy = 0; iy < ChunkInWidthInteger; iy++) begin
          chunk_in[ix][iy]
            <= image_in[chunk_x + ord_t'(ix)][chunk_y + ord_t'(iy)];
        end
      end

      // Advance the chunk origin.
      if (next_chunk_x >= OutWidth) begin
        if (next_chunk_y >= OutHeight) begin
          if (almost_done) begin
            done <= 1'b1;
          end else begin
            almost_done <= 1'b1;
          end
        end else begin
          chunk_x <= 0;
          // Clamp so the last chunk ends exactly at the image edge.
          chunk_y <= (next_chunk_y > LastChunkY) ? LastChunkY : next_chunk_y;
        end
      end else begin
        // Same clamp in the first dimension; without it the final chunk of
        // each pass indexes image_in and image_out beyond their bounds.
        chunk_x <= (next_chunk_x > LastChunkX) ? LastChunkX : next_chunk_x;
      end
    end
  end
endmodule
Here's a testbench for it:
#include "VChunkedConvolver.h"
#include "verilated.h"
#include <cstdio>
#include <cstring>
#include "png.h"
int main(int argc, char **argv) {
Verilated::commandArgs(argc, argv);
VChunkedConvolver *top = new VChunkedConvolver;
top->clock = 0;
top->reset = 1;
png_image png_in;
png_in.version = PNG_IMAGE_VERSION;
png_in.opaque = NULL;
png_image_begin_read_from_file(&png_in, "Vd-Lum.png");
png_in.format = PNG_FORMAT_GRAY;
png_image_finish_read(&png_in, NULL, top->image_in[0], 0, NULL);
png_image_free(&png_in);
int8_t kernel[3][3] = {
{-1, -1, -1},
{-1, 8, -1},
{-1, -1, -1}
};
for (int x = 0; x < 3; ++x) {
for (int y = 0; y < 3; ++y) {
top->kernel[x][y] = *reinterpret_cast<CData*>(&kernel[x][y]);
}
}
top->reciprocal = (1 << 16);
top->eval();
while(!(Verilated::gotFinish() || top->done)) {
top->clock = 1;
top->eval();
top->clock = 0;
top->eval();
top->reset = 0;
}
png_image png_out;
memset(&png_out, 0, sizeof(png_image));
png_out.opaque = NULL;
png_out.version = PNG_IMAGE_VERSION;
png_out.width = 98;
png_out.height = 98;
png_out.format = PNG_FORMAT_GRAY;
png_image_write_to_file(&png_out, "Vd-Result.png", 0,
top->image_out[0], 0, NULL);
png_image_free(&png_out);
delete top;
exit(0);
}
Results:
Input | Output |
---|---|
![Input image]() | ![Output image]() |
Success!
This took so long. . . . The time for sleep is now.