From ed6f0167fd1f6097ce9e454c2950c5db00a0c513 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Mon, 2 Nov 2015 21:49:00 -0500 Subject: [PATCH 1/6] remove extraneous +4 --- HW3/P3/tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index c16e9fa6..a0d56da2 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -23,7 +23,7 @@ def create_data(N): times = {} for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) @@ -40,7 +40,7 @@ def create_data(N): format(num_workgroups, num_workers, seconds)) for num_workgroups in 2 ** np.arange(3, 10): - partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4) + partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) for num_workers in 2 ** np.arange(2, 8): local = cl.LocalMemory(num_workers * 4) From 6706ee708d97c656caa08b284792b3eee63892cf Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Wed, 4 Nov 2015 21:46:08 -0500 Subject: [PATCH 2/6] typo in typecast --- HW3/P3/sum.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index 4fb771d2..ee914740 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -38,7 +38,7 @@ __kernel void sum_blocked(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - int k = ceil(float(N) / get_global_size(0)); + int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 // thread with global_id 1 should add k..2k-1 From f7dc5f66fb34745e3ed74460cdd037836c440df7 Mon Sep 17 00:00:00 2001 From: "Thouis (Ray) Jones" Date: Thu, 5 Nov 2015 09:43:23 -0500 Subject: [PATCH 3/6] unused module, set include path --- HW3/P4/median_filter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index 1eda1bb9..a181c05a 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -1,8 +1,8 @@ from __future__ import division import pyopencl as cl import numpy as np -import imread import pylab +import os.path def round_up(global_size, group_size): r = global_size % group_size @@ -51,7 +51,8 @@ def numpy_median(image, iterations=10): properties=cl.command_queue_properties.PROFILING_ENABLE) print 'The queue is using the device:', queue.device.name - program = cl.Program(context, open('median_filter.cl').read()).build(options='') + curdir = os.path.dirname(os.path.realpath(__file__)) + program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir]) host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy() host_image_filtered = np.zeros_like(host_image) From aec15054973fb1a0095a4b86b1db12dc0616860b Mon Sep 17 00:00:00 2001 From: Neil Chainani Date: Thu, 12 Nov 2015 14:51:44 -0500 Subject: [PATCH 4/6] Finished 3 and 4 --- HW3/P2/mandelbrot.cl | 15 ++++++++++-- HW3/P3/sum.cl | 54 +++++++++++++++-------------------------- HW3/P3/tune.py | 4 +-- HW3/P4/median_filter.cl | 48 +++++++++++++++++++++++++++++++++++- HW3/P4/median_filter.py | 2 +- 5 files changed, 82 insertions(+), 41 deletions(-) diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl index 5a11c020..c719f4c1 100644 --- a/HW3/P2/mandelbrot.cl +++ b/HW3/P2/mandelbrot.cl @@ -13,7 +13,18 @@ mandelbrot(__global __read_only float *coords_real, int iter; if ((x < w) && (y < h)) { - // YOUR CODE HERE - ; + x = y * w + x; + + z_real = x; + z_imag = y; + c_real = x; + c_imag = y; + iter = 0; + while (abs(z) < 2) && (iter < 511) { + z_real = z_real*z_real - z_imag*z_imag + c_real; + z_imag = 2*z_real*z_imag + c_imag + iter++; + } + } } diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl index ee914740..5aca7f71 100644 --- a/HW3/P3/sum.cl +++ b/HW3/P3/sum.cl @@ -5,29 +5,25 @@ __kernel void sum_coalesced(__global float* x, { float sum = 0; size_t local_id = get_local_id(0); - // thread i (i.e., with i = get_global_id()) should add x[i], // x[i + get_global_size()], ... up to N-1, and store in sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + int i = get_global_id(0); + int k = get_global_size(0); + int j; + for (j = i;j< N ; j += k) { + sum += x[j]; } fast[local_id] = sum; barrier(CLK_LOCAL_MEM_FENCE); - - // binary reduction - // - // thread i should sum fast[i] and fast[i + offset] and store back - // in fast[i], for offset = (local_size >> j) for j from 1 to - // log_2(local_size) - // - // You can assume get_local_size(0) is a power of 2. - // - // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for(j = get_local_size(0)/2; j > 0; j >>= 1) { + if(local_id < j) { + fast[local_id] += fast[local_id+j]; + } + barrier(CLK_LOCAL_MEM_FENCE); } + if (local_id == 0) partial[get_group_id(0)] = fast[0]; } @@ -41,31 +37,19 @@ __kernel void sum_blocked(__global float* x, int k = ceil((float)N / get_global_size(0)); // thread with global_id 0 should add 0..k-1 - // thread with global_id 1 should add k..2k-1 - // thread with global_id 2 should add 2k..3k-1 - // ... - // with k = ceil(N / get_global_size()). - // - // Be careful that each thread stays in bounds, both relative to - // size of x (i.e., N), and the range it's assigned to sum. - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + + for (int s = k*i;s < k * (i+1);a++) { + if (a < N) sum += x[a]; } fast[local_id] = sum; barrier(CLK_LOCAL_MEM_FENCE); - // binary reduction - // - // thread i should sum fast[i] and fast[i + offset] and store back - // in fast[i], for offset = (local_size >> j) for j from 1 to - // log_2(local_size) - // - // You can assume get_local_size(0) is a power of 2. - // - // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/ - for (;;) { // YOUR CODE HERE - ; // YOUR CODE HERE + for(j = get_local_size(0)/2; j > 0; j >>= 1) { + if(local_id < j) { + fast[local_id] += fast[local_id+j]; + } + barrier(CLK_LOCAL_MEM_FENCE); } if (local_id == 0) partial[get_group_id(0)] = fast[0]; diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py index a0d56da2..fbbd4121 100644 --- a/HW3/P3/tune.py +++ b/HW3/P3/tune.py @@ -22,10 +22,10 @@ def create_data(N): times = {} - for num_workgroups in 2 ** np.arange(3, 10): + for num_workgroups in 2 ** np.arange(3, 4): partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups) host_partial = np.empty(num_workgroups).astype(np.float32) - for num_workers in 2 ** np.arange(2, 8): + for num_workers in 2 ** np.arange(2, 3): local = cl.LocalMemory(num_workers * 4) event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,), x, partial_sums, local, np.uint64(N)) diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl index 07bb294c..de0a567f 100644 --- a/HW3/P4/median_filter.cl +++ b/HW3/P4/median_filter.cl @@ -12,10 +12,56 @@ median_3x3(__global __read_only float *in_values, // Note: It may be easier for you to implement median filtering // without using the local buffer, first, then adjust your code to // use such a buffer after you have that working. + const int x = get_global_id(0); + const int y = get_global_id(1); + const int lx = get_local_id(0); + const int ly = get_local_id(1); + const int buf_corner_x = x - lx - halo; + const int buf_corner_y = y - ly - halo; + + const int buf_x = lx + halo; + const int buf_y = ly + halo; + + const int idx_1D = ly * get_local_size(0) + lx; + + int row, tx, ty; + + if (idx_1D < buf_w){ + for (row = 0; row < buf_h; row++) { + tx = idx_1D; + ty = row; + if (tx+buf_corner_x < 0){ + tx++; + } else if(tx+buf_corner_y == w) { + tx--; + } + if(ty+buf_corner_y < 0) { + ty++; + } else if(ty+buf_corner_y == h) { + ty--; + } + buffer[ty * buf_w + tx] = \ + in_values[w*(buf_corner_y + ty)+(buf_corner_x + tx)]; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + + if ((y < h) && (x < w)) {// stay in bounds + out_values[y * w + x] = \ + median9(buffer[(buf_y-1) * buf_w + buf_x-1],\ + buffer[(buf_y-1) * buf_w + buf_x],\ + buffer[(buf_y-1) * buf_w + buf_x+1],\ + buffer[buf_y * buf_w + buf_x-1], \ + buffer[buf_y * buf_w + buf_x],\ + buffer[buf_y * buf_w + buf_x+1],\ + buffer[(buf_y+1) * buf_w + buf_x-1],\ + buffer[(buf_y+1) * buf_w + buf_x],\ + buffer[(buf_y+1) * buf_w + buf_x-1]); + } // Load into buffer (with 1-pixel halo). - // + // It may be helpful to consult HW3 Problem 5, and // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl // diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py index a181c05a..0732c7b8 100644 --- a/HW3/P4/median_filter.py +++ b/HW3/P4/median_filter.py @@ -42,7 +42,7 @@ def numpy_median(image, iterations=10): # Create a context with all the devices devices = platforms[0].get_devices() - context = cl.Context(devices) + context = cl.Context(devices[2]) print 'This context is associated with ', len(context.devices), 'devices' # Create a queue for transferring data and launching computations. From 399098a73950e91cafbd35c70e5c40ad12558d15 Mon Sep 17 00:00:00 2001 From: Neil Chainani Date: Fri, 20 Nov 2015 20:36:23 -0500 Subject: [PATCH 5/6] Adding problem 5 --- HW3/P5/P5.txt | 9 +++++++++ HW3/P5/label_regions.cl | 29 ++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) create mode 100644 HW3/P5/P5.txt diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt new file mode 100644 index 00000000..21fe1d4d --- /dev/null +++ b/HW3/P5/P5.txt @@ -0,0 +1,9 @@ +Part 5: + +The benefit of atomic_min is that it performs the calculation and swap +into a single step, writing the min value directly into memory. What this allows +is the elimination of a race condition, because another thread will not be +able to interrupt. Although technically min is faster than atomic_min, iterations +will likely be fewer with atomic_min, because it would be redundant to access +the same spot in memory again to compare, when that value in memory was updated +by another thread. diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl index 78b986b3..93beb090 100644 --- a/HW3/P5/label_regions.cl +++ b/HW3/P5/label_regions.cl @@ -56,15 +56,15 @@ propagate_labels(__global __read_write int *labels, // 1D index of thread within our work-group const int idx_1D = ly * get_local_size(0) + lx; - + int l,r,u,d; int old_label; // Will store the output value int new_label; - - // Load the relevant labels to a local buffer with a halo + + // Load the relevant labels to a local buffer with a halo if (idx_1D < buf_w) { for (int row = 0; row < buf_h; row++) { - buffer[row * buf_w + idx_1D] = + buffer[row * buf_w + idx_1D] = get_clamped_value(labels, w, h, buf_corner_x + idx_1D, buf_corner_y + row); @@ -79,19 +79,30 @@ propagate_labels(__global __read_write int *labels, // the pixel for this thread old_label = buffer[buf_y * buf_w + buf_x]; - // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2) - + // PART 2 + if (old_label < (w*h)) + buffer[(buf_y*buf_w)+buf_x] = labels[old_label]]; + + // PART 4 + // I RAN OUT OF TIME SO I DIDN"T DO PART 4 + + barrier(CLK_LOCAL_MEM_FENCE); + // stay in bounds if ((x < w) && (y < h)) { // CODE FOR PART 1 HERE - // We set new_label to the value of old_label, but you will need - // to adjust this for correctness. - new_label = old_label; + if (old_label < w*h) + new_label = min(old_label, + min(min(buffer[(buf_y - 1) * buf_w + buf_x], + buffer[(buf_y + 1) * buf_w + buf_x]), + min(buffer[buf_y * buf_w + buf_x - 1], + buffer[buf_y * buf_w + buf_x + 1]))); if (new_label != old_label) { // CODE FOR PART 3 HERE // indicate there was a change this iteration. // multiple threads might write this. + atomic_min(&labels[old_label],new_label); *(changed_flag) += 1; labels[y * w + x] = new_label; } From 31d1643288364b02706d5f89bfc9ff60286a0d51 Mon Sep 17 00:00:00 2001 From: Neil Chainani Date: Fri, 20 Nov 2015 20:53:17 -0500 Subject: [PATCH 6/6] added some text files --- HW3/P3/P3.txt | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ HW3/P5/P5.txt | 30 +++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 HW3/P3/P3.txt diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt new file mode 100644 index 00000000..6b5f12bc --- /dev/null +++ b/HW3/P3/P3.txt @@ -0,0 +1,93 @@ +NOTE: Because my computer does not have a GPU, I ran my code on +Avery Faller's computer, but unfortunately forgot to save my results, +so the results I have here are identical to his. + + +configuration ('coalesced', 16, 128): 0.00215712 seconds + +Console Output: + +coalesced reads, workgroups: 8, num_workers: 4, 0.09040008 seconds +coalesced reads, workgroups: 8, num_workers: 8, 0.04001288 seconds +coalesced reads, workgroups: 8, num_workers: 16, 0.02093552 seconds +coalesced reads, workgroups: 8, num_workers: 32, 0.01104784 seconds +coalesced reads, workgroups: 8, num_workers: 64, 0.00579024 seconds +coalesced reads, workgroups: 8, num_workers: 128, 0.00310792 seconds +coalesced reads, workgroups: 16, num_workers: 4, 0.04097784 seconds +coalesced reads, workgroups: 16, num_workers: 8, 0.01981504 seconds +coalesced reads, workgroups: 16, num_workers: 16, 0.01036864 seconds +coalesced reads, workgroups: 16, num_workers: 32, 0.0058096 seconds +coalesced reads, workgroups: 16, num_workers: 64, 0.00312136 seconds +coalesced reads, workgroups: 16, num_workers: 128, 0.00215712 seconds +coalesced reads, workgroups: 32, num_workers: 4, 0.02147344 seconds +coalesced reads, workgroups: 32, num_workers: 8, 0.01148328 seconds +coalesced reads, workgroups: 32, num_workers: 16, 0.0062996 seconds +coalesced reads, workgroups: 32, num_workers: 32, 0.00340448 seconds +coalesced reads, workgroups: 32, num_workers: 64, 0.00237592 seconds +coalesced reads, workgroups: 32, num_workers: 128, 0.00240984 seconds +coalesced reads, workgroups: 64, num_workers: 4, 0.02645568 seconds +coalesced reads, workgroups: 64, num_workers: 8, 0.01386064 seconds +coalesced reads, workgroups: 64, num_workers: 16, 0.00776248 seconds +coalesced reads, workgroups: 64, num_workers: 32, 0.00387968 seconds +coalesced reads, workgroups: 64, num_workers: 64, 0.00273104 seconds +coalesced reads, workgroups: 64, num_workers: 128, 0.00276912 seconds +coalesced reads, workgroups: 128, num_workers: 4, 0.03042928 seconds +coalesced reads, workgroups: 128, num_workers: 8, 0.01708752 seconds +coalesced reads, workgroups: 128, num_workers: 16, 0.00900744 seconds +coalesced reads, workgroups: 128, num_workers: 32, 0.00454856 seconds +coalesced reads, workgroups: 128, num_workers: 64, 0.00311744 seconds +coalesced reads, workgroups: 128, num_workers: 128, 0.00308584 seconds +coalesced reads, workgroups: 256, num_workers: 4, 0.03494984 seconds +coalesced reads, workgroups: 256, num_workers: 8, 0.0178412 seconds +coalesced reads, workgroups: 256, num_workers: 16, 0.00935056 seconds +coalesced reads, workgroups: 256, num_workers: 32, 0.00466496 seconds +coalesced reads, workgroups: 256, num_workers: 64, 0.0033144 seconds +coalesced reads, workgroups: 256, num_workers: 128, 0.00326072 seconds +coalesced reads, workgroups: 512, num_workers: 4, 0.03756072 seconds +coalesced reads, workgroups: 512, num_workers: 8, 0.02008384 seconds +coalesced reads, workgroups: 512, num_workers: 16, 0.01058008 seconds +coalesced reads, workgroups: 512, num_workers: 32, 0.00541512 seconds +coalesced reads, workgroups: 512, num_workers: 64, 0.00369688 seconds +coalesced reads, workgroups: 512, num_workers: 128, 0.0035432 seconds +blocked reads, workgroups: 8, num_workers: 4, 0.15486136 seconds +blocked reads, workgroups: 8, num_workers: 8, 0.0681916 seconds +blocked reads, workgroups: 8, num_workers: 16, 0.04814448 seconds +blocked reads, workgroups: 8, num_workers: 32, 0.02598048 seconds +blocked reads, workgroups: 8, num_workers: 64, 0.01248544 seconds +blocked reads, workgroups: 8, num_workers: 128, 0.0184216 seconds +blocked reads, workgroups: 16, num_workers: 4, 0.06849272 seconds +blocked reads, workgroups: 16, num_workers: 8, 0.03739512 seconds +blocked reads, workgroups: 16, num_workers: 16, 0.02923672 seconds +blocked reads, workgroups: 16, num_workers: 32, 0.01343512 seconds +blocked reads, workgroups: 16, num_workers: 64, 0.02388728 seconds +blocked reads, workgroups: 16, num_workers: 128, 0.06485984 seconds +blocked reads, workgroups: 32, num_workers: 4, 0.03443728 seconds +blocked reads, workgroups: 32, num_workers: 8, 0.01843784 seconds +blocked reads, workgroups: 32, num_workers: 16, 0.0122004 seconds +blocked reads, workgroups: 32, num_workers: 32, 0.02377544 seconds +blocked reads, workgroups: 32, num_workers: 64, 0.05319584 seconds +blocked reads, workgroups: 32, num_workers: 128, 0.06486128 seconds +blocked reads, workgroups: 64, num_workers: 4, 0.0276432 seconds +blocked reads, workgroups: 64, num_workers: 8, 0.01499512 seconds +blocked reads, workgroups: 64, num_workers: 16, 0.01137568 seconds +blocked reads, workgroups: 64, num_workers: 32, 0.02022752 seconds +blocked reads, workgroups: 64, num_workers: 64, 0.06031472 seconds +blocked reads, workgroups: 64, num_workers: 128, 0.07757752 seconds +blocked reads, workgroups: 128, num_workers: 4, 0.02424192 seconds +blocked reads, workgroups: 128, num_workers: 8, 0.01481192 seconds +blocked reads, workgroups: 128, num_workers: 16, 0.01068648 seconds +blocked reads, workgroups: 128, num_workers: 32, 0.0210288 seconds +blocked reads, workgroups: 128, num_workers: 64, 0.06772584 seconds +blocked reads, workgroups: 128, num_workers: 128, 0.04927648 seconds +blocked reads, workgroups: 256, num_workers: 4, 0.02140304 seconds +blocked reads, workgroups: 256, num_workers: 8, 0.01250392 seconds +blocked reads, workgroups: 256, num_workers: 16, 0.00837776 seconds +blocked reads, workgroups: 256, num_workers: 32, 0.02519176 seconds +blocked reads, workgroups: 256, num_workers: 64, 0.0549532 seconds +blocked reads, workgroups: 256, num_workers: 128, 0.0319016 seconds +blocked reads, workgroups: 512, num_workers: 4, 0.02212456 seconds +blocked reads, workgroups: 512, num_workers: 8, 0.01304008 seconds +blocked reads, workgroups: 512, num_workers: 16, 0.00980616 seconds +blocked reads, workgroups: 512, num_workers: 32, 0.02398448 seconds +blocked reads, workgroups: 512, num_workers: 64, 0.0326936 seconds +blocked reads, workgroups: 512, num_workers: 128, 0.02149976 seconds diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt index 21fe1d4d..876e97b6 100644 --- a/HW3/P5/P5.txt +++ b/HW3/P5/P5.txt @@ -1,3 +1,33 @@ +NOTE: Because my computer does not have a GPU, I ran my code on +Isadora Nun's computer, so the results I have here will be near identical +to hers. + +PART 1 +MAZE1: Finished after 871 iterations, 567.5693 ms total, 0.610802777422 ms per iteration + Found 2 regions + +MAZE 2: Finished after 509 iterations, 341.0643 ms total, 0.683404097784 ms per iteration + Found 35 regions + + +PART 2 + +MAZE 1: Finished after 531 iterations, 333.93277 ms total, 0.638699130435 ms per iteration + Found 2 regions + + +MAZE 2: Finished after 266 iterations, 188.24075 ms total, 0.639058300322 ms per iteration + Found 35 regions + +PART 3 +MAZE 3: Finished after 9 iterations, 6.11306 ms total, 0.82901 ms per iteration + Found 2 regions + + +MAZE 2: Finished after 8 iterations, 6.73873 ms total, 0.83655 ms per iteration + Found 35 regions + + Part 5: The benefit of atomic_min is that it performs the calculation and swap