Hw3 #421

Open: wants to merge 9 commits into base: HW3
87 changes: 87 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,87 @@
The best configuration was, as expected, coalesced reads, with 256 workgroups and 64 workers per group at about 0.0028 seconds. With coalesced reads, adjacent workers touch adjacent array elements on each pass, so the hardware can combine their loads into a small number of wide memory transactions.

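For reference, a minimal sketch of the two access patterns being timed (illustrative only; the actual kernels are in HW3/P3/sum.cl below):

// Coalesced: on each pass, adjacent threads read adjacent elements of x,
// so their loads can be serviced together.
for (int c = get_global_id(0); c < N; c += get_global_size(0))
    sum += x[c];

// Blocked: each thread reads its own contiguous chunk of ceil(N / global_size)
// elements, so on any given pass adjacent threads touch addresses that are
// far apart and their loads cannot be combined.
int k = ceil((float)N / get_global_size(0));
for (int i = 0; i < k; i++)
    if (k * get_global_id(0) + i < N)
        sum += x[k * get_global_id(0) + i];
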
coalesced reads, workgroups: 8, num_workers: 4, 0.17128184 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.0681632 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.05655496 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.02897184 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.0144704 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.0077896 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.09570504 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.05443 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.02877784 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.01502832 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00775696 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.0039332 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.05933192 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.02975696 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.01458912 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00776816 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00391616 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.00308944 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.02779384 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.01499048 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.00775552 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00399272 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.0032104 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00299176 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.02973544 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.0151668 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.00765784 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00411832 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.00301736 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00290664 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.03019944 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.01544704 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.00798584 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00427288 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00282984 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00283432 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.03030056 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.01520048 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.00820648 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00448768 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00285128 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.00295096 seconds

blocked reads, workgroups: 8, num_workers: 4, 0.1875092 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.07682712 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.04721432 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.0260356 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.0145952 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.01322928 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.11220432 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.05103584 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.0381004 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.01598448 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.011868 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.03393464 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.07117064 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.03616536 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.02227952 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.01344776 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.0342316 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.08281928 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.03155968 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.02060984 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.01327808 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.03418224 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.0794408 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.0758292 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.0247952 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.0154636 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.01440504 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.03554616 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.0905076 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.06857392 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.0332628 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.02187848 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.0133096 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.03665168 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.06601888 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.05091112 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.03760328 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.02227768 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.01377232 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.03684656 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.04446296 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.036996 seconds
53 changes: 33 additions & 20 deletions HW3/P3/sum.cl
@@ -3,29 +3,35 @@ __kernel void sum_coalesced(__global float* x,
__local float* fast,
long N)
{

float sum = 0;
size_t local_id = get_local_id(0);
int i = get_global_id(0);
int k = get_global_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (int counter = i; counter < N; counter += k) {
    sum += x[counter];
}

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
// thread i should sum fast[i] and fast[i + offset] and store back
// in fast[i], for offset = (local_size >> j) for j from 1 to
// log_2(local_size)
//
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (size_t offset = get_local_size(0) >> 1; offset > 0; offset = offset >> 1) {
    if (local_id < offset) {
        fast[local_id] = fast[local_id] + fast[local_id + offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
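// Worked example (an illustrative trace, not part of the original code):
// with get_local_size(0) == 8, the loop above runs offsets 4, 2, 1, so the
// eight partial sums are combined in three steps and the workgroup total
// ends up in fast[0].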

if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +44,9 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil((float)N / get_global_size(0));
size_t global_id = get_global_id(0);


// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,8 +56,10 @@
//
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (int i = 0; i < k; i++) {
    if (k * global_id + i < N) {
        sum += x[k * global_id + i];
    }
}

fast[local_id] = sum;
@@ -64,8 +74,11 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (size_t offset = get_local_size(0) >> 1; offset > 0; offset = offset >> 1) {
    if (local_id < offset) {
        fast[local_id] = fast[local_id] + fast[local_id + offset];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
14 changes: 8 additions & 6 deletions HW3/P3/tune.py
@@ -1,5 +1,7 @@
import pyopencl as cl
import numpy as np
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'

def create_data(N):
return host_x, x
@@ -14,23 +16,23 @@ def create_data(N):
ctx = cl.Context(devices)

queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

program = cl.Program(ctx, open('sum.cl').read())
try:
    program.build(options='')
except cl.RuntimeError:
    print program.get_build_info(ctx.devices[0], cl.program_build_info.LOG)
host_x = np.random.rand(N).astype(np.float32)
x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=host_x)

times = {}

for num_workgroups in 2 ** np.arange(3, 10):
    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
        event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
                                      x, partial_sums, local, np.uint64(N))
        cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)

        sum_gpu = sum(host_partial)
        sum_host = sum(host_x)
        seconds = (event.profile.end - event.profile.start) / 1e9
@@ -40,7 +42,7 @@ def create_data(N):
        print('coalesced reads, workgroups: {}, num_workers: {}, {} seconds'.
              format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
    partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
    host_partial = np.empty(num_workgroups).astype(np.float32)
    for num_workers in 2 ** np.arange(2, 8):
        local = cl.LocalMemory(num_workers * 4)
69 changes: 53 additions & 16 deletions HW3/P4/median_filter.cl
@@ -1,5 +1,19 @@
#include "median9.h"

// Clamp (x, y) to the image bounds and return that pixel, so reads that
// fall outside the image replicate the nearest valid pixel's value.
float fetch(__global __read_only float *in_values, int image_w, int image_h, int x, int y)
{
    if (x < 0) {
        x = 0;
    }
    if (x >= image_w) {
        x = image_w - 1;
    }
    if (y < 0) {
        y = 0;
    }
    if (y >= image_h) {
        y = image_h - 1;
    }
    return in_values[y * image_w + x];
}

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -9,26 +23,49 @@ median_3x3(__global __read_only float *in_values,
int buf_w, int buf_h,
const int halo)
{
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Globally out-of-bounds pixels are replaced with the nearest valid
// pixel's value, since fetch() clamps its coordinates to the image.
// Each of the first buf_w threads loads one column of the buffer.
int row;

if (idx_1D < buf_w) {
    for (row = 0; row < buf_h; row++) {
        buffer[row * buf_w + idx_1D] =
            fetch(in_values, w, h,
                  buf_corner_x + idx_1D,
                  buf_corner_y + row);
    }
}

barrier(CLK_LOCAL_MEM_FENCE);

// Compute the 3x3 median for each core (non-halo) pixel, using the
// median9() function from median9.h. Each thread in the valid region
// (x < w, y < h) writes back the median of its 3x3 neighborhood.
if ((y < h) && (x < w)) {
    out_values[y * w + x] = median9(
        buffer[(buf_y - 1) * buf_w + (buf_x - 1)],
        buffer[(buf_y - 1) * buf_w + (buf_x)],
        buffer[(buf_y - 1) * buf_w + (buf_x + 1)],
        buffer[(buf_y) * buf_w + (buf_x - 1)],
        buffer[(buf_y) * buf_w + (buf_x)],
        buffer[(buf_y) * buf_w + (buf_x + 1)],
        buffer[(buf_y + 1) * buf_w + (buf_x - 1)],
        buffer[(buf_y + 1) * buf_w + (buf_x)],
        buffer[(buf_y + 1) * buf_w + (buf_x + 1)]);
}
}
6 changes: 4 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,9 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os
os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'

def round_up(global_size, group_size):
r = global_size % group_size
@@ -51,7 +52,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
36 changes: 36 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,36 @@
Part 1
Maze 1: Finished after 885 iterations, 446.19376 ms total, 0.504173740113 ms per iteration
Found 2 regions

Maze 2: Finished after 516 iterations, 254.36296 ms total, 0.492951472868 ms per iteration
Found 35 regions
____________________
Part 2
Maze 1: Finished after 529 iterations, 266.34816 ms total, 0.5034936862 ms per iteration
Found 2 regions

Maze 2: Finished after 273 iterations, 135.44424 ms total, 0.496132747253 ms per iteration
Found 35 regions
_____________________
Part 3
Maze 1: Finished after 10 iterations, 5.0064 ms total, 0.50064 ms per iteration
Found 2 regions

Maze 2: Finished after 9 iterations, 4.40704 ms total, 0.489671111111 ms per iteration
Found 35 regions
_____________________
Part 4
Maze 1: Finished after 891 iterations, 447.6776 ms total, 0.502443995511 ms per iteration
Found 2 regions

Maze 2: Finished after 520 iterations, 257.42656 ms total, 0.495051076923 ms per iteration
Found 35 regions

This is clearly much worse. Serializing the loads through a single thread is slower than having every thread read from global memory multiple times. The single-threaded version could only win if the GPU's global reads were much slower and the workgroup contained very few threads, so that little parallelism was being given up. A sketch of the two strategies follows.
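
A minimal sketch of the two loading strategies being compared (illustrative only; the buffer and label names are assumed, not taken from the actual P5 kernel):

// Parts 1-3 style: every thread fetches one entry of the workgroup's tile,
// so the global reads proceed in parallel and can coalesce.
buffer[buf_y * buf_w + buf_x] = labels[y * w + x];

// Part 4 style: thread (0, 0) alone fetches the whole tile, serializing
// buf_w * buf_h global reads while the other threads sit idle.
if (lx == 0 && ly == 0) {
    for (int row = 0; row < buf_h; row++) {
        for (int col = 0; col < buf_w; col++) {
            buffer[row * buf_w + col] =
                labels[(buf_corner_y + row) * w + (buf_corner_x + col)];
        }
    }
}
barrier(CLK_LOCAL_MEM_FENCE);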
_____________________
Part 5
atomic_min() is guaranteed to read the old value, compute the minimum, and write the result back as a single indivisible operation. This avoids the race condition you get with a plain min(): a thread might read a memory location and compute its minimum, but before it writes the result back it is descheduled and another thread updates the same location; when the first thread finally writes, its value may no longer be the correct minimum. So with min(), labels could increase within an iteration, but not between iterations, because barrier() guarantees that an entire iteration finishes before the next begins. The final answer is still correct, because any two labels that min() could write in the wrong order are guaranteed to belong to the same connected component. The only catch is that, in principle, you could loop forever if the racing writes always land in the wrong order, but that is unlikely.
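
To make the difference concrete, a minimal sketch (illustrative; labels, idx, and new_label are assumed names, not from the actual P5 kernel):

// Racy read-modify-write: another thread can update labels[idx] between
// this thread's read and its write, and the write below would then
// overwrite that smaller value with a stale result.
labels[idx] = min(labels[idx], new_label);

// Atomic version: the read, comparison, and write happen as one
// indivisible operation, so no concurrent update can be lost.
atomic_min(&labels[idx], new_label);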
