Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hw3 #427

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open

Hw3 #427

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,27 @@ mandelbrot(__global __read_only float *coords_real,
const int y = get_global_id(1);

float c_real, c_imag;
float z_real, z_imag;
int iter;
float z_real, z_imag, z_imag_sq, z_real_sq;
int iter, offset;

if ((x < w) && (y < h)) {
// YOUR CODE HERE
;
offset = y * w + x;
// Get complex number c
c_real = coords_real[offset];
c_imag = coords_imag[offset];
// Initialize z
z_real = 0.0;
z_imag = 0.0;
for (iter = 0; iter < max_iter; iter++) {
z_real_sq = z_real * z_real;
z_imag_sq = z_imag * z_imag;
if (z_real_sq + z_imag_sq > 4) {
break;
}
z_imag = 2 * z_real * z_imag + c_imag;
z_real = z_real_sq - z_imag_sq + c_real;
}
out_counts[offset] = iter;
}
}
85 changes: 85 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
coalesced reads, workgroups: 8, num_workers: 4, 0.13523816 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.05002128 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.02228568 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.01243688 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.00679152 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.00387344 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.05827352 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.02823304 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.01497992 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.00788968 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00434872 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.0030288 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.02866544 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.01335272 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.007186 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00406992 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00277344 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.00296696 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.03042336 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.01716112 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.01011488 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00509712 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.0039112 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00386304 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.03748808 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.01877584 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.01011024 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00510552 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.00361184 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00353344 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.04078736 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.01956208 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.01072384 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00538184 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00410528 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00426576 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.04820448 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.02555352 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.01377392 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00804696 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00533144 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.00534184 seconds
blocked reads, workgroups: 8, num_workers: 4, 0.18848704 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.0736176 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.04906264 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.02473608 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.01126728 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.02127384 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.06236696 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.03539944 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.02401944 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.01006624 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.01824392 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.0521756 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.02761504 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.01609232 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.01000312 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.01815032 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.04871368 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.06726688 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.02830136 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01547568 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.0123616 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.02625976 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.08451304 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.08008256 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.02486112 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.0149828 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.01152056 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.0220164 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.06941248 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.05325168 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.02504432 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.01262448 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.00845736 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.02512984 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.05071976 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.03916664 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.02529368 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.01537016 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.0116124 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.03006704 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.04359768 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.0251344 seconds
configuration ('coalesced', 32, 64): 0.00277344 seconds <-- *** BEST ***
163 changes: 91 additions & 72 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
@@ -1,72 +1,91 @@
// sum_coalesced (assignment skeleton): per-work-group partial sum of x.
//
// x       - input values in global memory
// partial - output; the work-item with local id 0 writes one value at
//           index get_group_id(0)
// fast    - local scratch, indexed by get_local_id(0)
// N       - upper bound on the indices of x to be read (see comment below)
//
// NOTE: both `for (;;)` loops are unfilled placeholders with no exit
// condition; as written the kernel never gets past the first one.
__kernel void sum_coalesced(__global float* x,
__global float* partial,
__local float* fast,
long N)
{
float sum = 0;
size_t local_id = get_local_id(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
}

// publish this work-item's private sum, then wait until every work-item
// in the group has done the same before the reduction reads fast[]
fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
// thread i should sum fast[i] and fast[i + offset] and store back
// in fast[i], for offset = (local_size >> j) for j from 1 to
// log_2(local_size)
//
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
}

// a single work-item per group writes the group's result
if (local_id == 0) partial[get_group_id(0)] = fast[0];
}

// sum_blocked (assignment skeleton): per-work-group partial sum of x, where
// each work-item is meant to sum one contiguous range of k elements.
//
// x       - input values in global memory
// partial - output; the work-item with local id 0 writes one value at
//           index get_group_id(0)
// fast    - local scratch, indexed by get_local_id(0)
// N       - size of x (per the bounds comment below)
//
// NOTE: both `for (;;)` loops are unfilled placeholders with no exit
// condition; as written the kernel never gets past the first one.
__kernel void sum_blocked(__global float* x,
__global float* partial,
__local float* fast,
long N)
{
float sum = 0;
size_t local_id = get_local_id(0);
// NOTE(review): N is converted to float before the division; float cannot
// represent every integer above 2^24, so k may be inexact for large N.
int k = ceil((float)N / get_global_size(0));

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
// thread with global_id 2 should add 2k..3k-1
// ...
// with k = ceil(N / get_global_size()).
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
}

// publish this work-item's private sum, then wait until every work-item
// in the group has done the same before the reduction reads fast[]
fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
// thread i should sum fast[i] and fast[i + offset] and store back
// in fast[i], for offset = (local_size >> j) for j from 1 to
// log_2(local_size)
//
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
}

// a single work-item per group writes the group's result
if (local_id == 0) partial[get_group_id(0)] = fast[0];
}
// sum_coalesced: per-work-group partial sum of x[0..N-1].
//
// Work-item i accumulates x[i], x[i + get_global_size(0)], ... while the
// index is below N, so neighbouring work-items read neighbouring elements
// on every step. The per-work-item sums are then combined in local memory
// with a binary tree reduction.
//
// x       - input values in global memory
// partial - output; partial[get_group_id(0)] receives this group's sum
// fast    - local scratch, indexed by local id (one float per work-item)
// N       - number of elements of x to sum
//
// Assumes get_local_size(0) is a power of 2 (required by the halving
// reduction below).
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
__kernel void sum_coalesced(__global float* x,
                            __global float* partial,
                            __local float* fast,
                            long N)
{
    const size_t local_id = get_local_id(0);
    const size_t local_size = get_local_size(0);
    // N is a 64-bit long, so the running index and stride are kept in long
    // as well; an int index would overflow once N exceeds INT_MAX.
    const long stride = (long)get_global_size(0);
    float sum = 0;

    for (long idx = (long)get_global_id(0); idx < N; idx += stride) {
        sum += x[idx];
    }

    // publish the private sum and wait for the whole group before reducing
    fast[local_id] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // binary reduction: on each pass the lower `half` work-items fold in the
    // value `half` slots above them, so the result always lands at the
    // smaller of the two indices and ends up in fast[0].
    for (size_t half = local_size >> 1; half > 0; half >>= 1) {
        if (local_id < half) {
            fast[local_id] += fast[local_id + half];
        }
        // every work-item reaches this barrier, including the idle ones
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
}

// sum_blocked: per-work-group partial sum of x[0..N-1].
//
// Work-item g sums the contiguous range x[g*k .. g*k + k - 1], clipped to
// N, where k = ceil(N / get_global_size(0)). The per-work-item sums are
// then combined in local memory with a binary tree reduction.
//
// x       - input values in global memory
// partial - output; partial[get_group_id(0)] receives this group's sum
// fast    - local scratch, indexed by local id (one float per work-item)
// N       - number of elements of x to sum
//
// Assumes get_local_size(0) is a power of 2 (required by the halving
// reduction below).
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
__kernel void sum_blocked(__global float* x,
                          __global float* partial,
                          __local float* fast,
                          long N)
{
    const size_t local_id = get_local_id(0);
    const size_t local_size = get_local_size(0);
    const long global_size = (long)get_global_size(0);
    // Integer ceiling division. Going through float (ceil((float)N / size))
    // rounds N to 24 significant bits, which can make k too small for large
    // N and silently drop the tail of x.
    const long k = (N + global_size - 1) / global_size;
    // 64-bit range bounds: k * global_id can exceed INT_MAX for large N.
    const long begin = k * (long)get_global_id(0);
    long end = begin + k;
    float sum = 0;

    // stay inside x as well as inside this work-item's own range
    if (end > N) {
        end = N;
    }
    for (long idx = begin; idx < end; idx++) {
        sum += x[idx];
    }

    // publish the private sum and wait for the whole group before reducing
    fast[local_id] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // binary reduction: on each pass the lower `half` work-items fold in the
    // value `half` slots above them, so the result always lands at the
    // smaller of the two indices and ends up in fast[0].
    for (size_t half = local_size >> 1; half > 0; half >>= 1) {
        if (local_id < half) {
            fast[local_id] += fast[local_id + half];
        }
        // every work-item reaches this barrier, including the idle ones
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_id == 0) partial[get_group_id(0)] = fast[0];
}
2 changes: 2 additions & 0 deletions HW3/P3/tune.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
import pyopencl as cl
import numpy as np

Expand Down
75 changes: 75 additions & 0 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
@@ -1,5 +1,30 @@
#include "median9.h"

// ADAPTED FROM HW3 P5 label_regions.cl
// Return the value stored in in_values (row-major, row length w) at (x, y).
// A coordinate below 0 is replaced by 0 and a coordinate at or beyond the
// extent (w for x, h for y) is replaced by extent - 1, so out-of-image
// requests read the nearest edge pixel instead.
float get_clamped_value(__global __read_only float *in_values,int w, int h, int x, int y) {
    const int col = (x < 0) ? 0 : ((x >= w) ? (w - 1) : x);
    const int row = (y < 0) ? 0 : ((y >= h) ? (h - 1) : y);
    return in_values[row * w + col];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
Expand Down Expand Up @@ -31,4 +56,54 @@ median_3x3(__global __read_only float *in_values,

// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.

//ADAPTED FROM https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int row;
// Read pixel values and store in buffer
if (idx_1D < buf_w)
for (row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] = \
get_clamped_value(in_values,
w, h,
buf_corner_x + idx_1D,
buf_corner_y + row);
}

barrier(CLK_LOCAL_MEM_FENCE);

// write output
if ((y < h) && (x < w)) // stay in bounds
// Calculate 3x3 median
out_values[y * w + x] = \
median9(buffer[(buf_y - 1) * buf_w + buf_x - 1],
buffer[(buf_y - 1) * buf_w + buf_x],
buffer[(buf_y - 1) * buf_w + buf_x + 1],
buffer[buf_y * buf_w + buf_x - 1],
buffer[buf_y * buf_w + buf_x],
buffer[buf_y * buf_w + buf_x + 1],
buffer[(buf_y + 1) * buf_w + buf_x - 1],
buffer[(buf_y + 1) * buf_w + buf_x],
buffer[(buf_y + 1) * buf_w + buf_x + 1]);

}
Loading