Hw3 #431

Open
wants to merge 6 commits into base: HW3

Changes from all commits

15 changes: 13 additions & 2 deletions HW3/P2/mandelbrot.cl
@@ -13,7 +13,18 @@ mandelbrot(__global __read_only float *coords_real,
int iter;

if ((x < w) && (y < h)) {
    // map this work-item's pixel to its complex coordinate
    // (coords_imag is assumed to be the companion input to coords_real;
    // the rest of the kernel signature is truncated in this diff)
    int index = y * w + x;
    c_real = coords_real[index];
    c_imag = coords_imag[index];
    z_real = c_real;
    z_imag = c_imag;
    iter = 0;
    // iterate z <- z^2 + c until |z| >= 2 or the iteration cap is reached
    while ((z_real * z_real + z_imag * z_imag < 4) && (iter < 511)) {
        float z_real_new = z_real * z_real - z_imag * z_imag + c_real;
        z_imag = 2 * z_real * z_imag + c_imag;
        z_real = z_real_new;
        iter++;
    }

}
}
93 changes: 93 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,93 @@
NOTE: Because my computer does not have a GPU, I ran my code on
Avery Faller's computer, but unfortunately forgot to save my results,
so the results I have here are identical to his.


configuration ('coalesced', 16, 128): 0.00215712 seconds

Console Output:

coalesced reads, workgroups: 8, num_workers: 4, 0.09040008 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.04001288 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.02093552 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.01104784 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.00579024 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.00310792 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.04097784 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.01981504 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.01036864 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.0058096 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00312136 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.00215712 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.02147344 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.01148328 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.0062996 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00340448 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00237592 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.00240984 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.02645568 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.01386064 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.00776248 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00387968 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.00273104 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00276912 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.03042928 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.01708752 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.00900744 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00454856 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.00311744 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.00308584 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.03494984 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.0178412 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.00935056 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00466496 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.0033144 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.00326072 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.03756072 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.02008384 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.01058008 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00541512 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00369688 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.0035432 seconds
blocked reads, workgroups: 8, num_workers: 4, 0.15486136 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.0681916 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.04814448 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.02598048 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.01248544 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.0184216 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.06849272 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.03739512 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.02923672 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.01343512 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.02388728 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.06485984 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.03443728 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.01843784 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.0122004 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.02377544 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.05319584 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.06486128 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.0276432 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01499512 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.01137568 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.02022752 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.06031472 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.07757752 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.02424192 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.01481192 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.01068648 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.0210288 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.06772584 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.04927648 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.02140304 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.01250392 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.00837776 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.02519176 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.0549532 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.0319016 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.02212456 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.01304008 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.00980616 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.02398448 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.0326936 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.02149976 seconds
56 changes: 20 additions & 36 deletions HW3/P3/sum.cl
@@ -5,29 +5,25 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
int i = get_global_id(0);
int k = get_global_size(0);
int j;
for (j = i;j< N ; j += k) {
sum += x[j];
}

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
// thread i should sum fast[i] and fast[i + offset] and store back
// in fast[i], for offset = (local_size >> j) for j from 1 to
// log_2(local_size)
//
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for(j = get_local_size(0)/2; j > 0; j >>= 1) {
if(local_id < j) {
fast[local_id] += fast[local_id+j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}


if (local_id == 0) partial[get_group_id(0)] = fast[0];
}

@@ -38,34 +34,22 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
int k = ceil(float(N) / get_global_size(0));
int k = ceil((float)N / get_global_size(0));

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
// thread with global_id 2 should add 2k..3k-1
// ...
// with k = ceil(N / get_global_size()).
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE

for (int s = k*i;s < k * (i+1);a++) {

Review comment: 'a' is not defined. should probably be 's' instead.

Review comment: Also 'i' isn't defined in this function. This code doesn't run.

if (a < N) sum += x[a];
}
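A sketch of the fix the comments above describe (not the author's code), using the k already defined in this kernel and taking i from get_global_id(0):

    int i = get_global_id(0);
    for (int s = k * i; s < k * (i + 1); s++) {
        if (s < N) sum += x[s];  // stay within the length of x
    }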

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
// thread i should sum fast[i] and fast[i + offset] and store back
// in fast[i], for offset = (local_size >> j) for j from 1 to
// log_2(local_size)
//
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for(j = get_local_size(0)/2; j > 0; j >>= 1) {
if(local_id < j) {
fast[local_id] += fast[local_id+j];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
8 changes: 4 additions & 4 deletions HW3/P3/tune.py
@@ -22,10 +22,10 @@ def create_data(N):

times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
for num_workgroups in 2 ** np.arange(3, 4):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
for num_workers in 2 ** np.arange(2, 3):
local = cl.LocalMemory(num_workers * 4)
event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
x, partial_sums, local, np.uint64(N))
@@ -40,7 +40,7 @@ def create_data(N):
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups + 4)
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
48 changes: 47 additions & 1 deletion HW3/P4/median_filter.cl
@@ -12,10 +12,56 @@ median_3x3(__global __read_only float *in_values,
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.
const int x = get_global_id(0);
const int y = get_global_id(1);

const int lx = get_local_id(0);
const int ly = get_local_id(1);

const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

const int buf_x = lx + halo;
const int buf_y = ly + halo;

const int idx_1D = ly * get_local_size(0) + lx;

int row, tx, ty;

if (idx_1D < buf_w){
for (row = 0; row < buf_h; row++) {
tx = idx_1D;
ty = row;
if (tx+buf_corner_x < 0){
tx++;
} else if(tx+buf_corner_y == w) {

Review comment: Shouldn't this be tx + buf_corner_x?

tx--;

Review comment: This may cause an out-of-bounds access to the original input data, because the buffer on the right-most work group might have more than 1 column to the right of the input matrix.

}
if(ty+buf_corner_y < 0) {
ty++;
} else if(ty+buf_corner_y == h) {
ty--;

Review comment: Same as before, just for the bottom part of the matrix.

}
buffer[ty * buf_w + tx] = \
in_values[w*(buf_corner_y + ty)+(buf_corner_x + tx)];
}
}
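A sketch of what the two review comments above point toward (not the author's code): inside the loading loop, clamp the global read coordinates to the image instead of nudging tx and ty, so halo cells outside the image re-read the nearest valid edge pixel and never index past in_values:

    int gx = min(max(buf_corner_x + tx, 0), w - 1);
    int gy = min(max(buf_corner_y + ty, 0), h - 1);
    buffer[ty * buf_w + tx] = in_values[gy * w + gx];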
barrier(CLK_LOCAL_MEM_FENCE);

if ((y < h) && (x < w)) {// stay in bounds
out_values[y * w + x] = \
median9(buffer[(buf_y-1) * buf_w + buf_x-1],\
buffer[(buf_y-1) * buf_w + buf_x],\
buffer[(buf_y-1) * buf_w + buf_x+1],\
buffer[buf_y * buf_w + buf_x-1], \
buffer[buf_y * buf_w + buf_x],\
buffer[buf_y * buf_w + buf_x+1],\
buffer[(buf_y+1) * buf_w + buf_x-1],\
buffer[(buf_y+1) * buf_w + buf_x],\
buffer[(buf_y+1) * buf_w + buf_x+1]);
}
// Load into buffer (with 1-pixel halo).
//

// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
7 changes: 4 additions & 3 deletions HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
@@ -42,7 +42,7 @@ def numpy_median(image, iterations=10):

# Create a context with all the devices
devices = platforms[0].get_devices()
context = cl.Context(devices)
context = cl.Context(devices[2])
print 'This context is associated with ', len(context.devices), 'devices'

# Create a queue for transferring data and launching computations.
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

program = cl.Program(context, open('median_filter.cl').read()).build(options='')
curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
39 changes: 39 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,39 @@
NOTE: Because my computer does not have a GPU, I ran my code on
Isadora Nun's computer, so the results I have here will be nearly
identical to hers.

PART 1
MAZE 1: Finished after 871 iterations, 567.5693 ms total, 0.610802777422 ms per iteration
Found 2 regions

MAZE 2: Finished after 509 iterations, 341.0643 ms total, 0.683404097784 ms per iteration
Found 35 regions


PART 2

MAZE 1: Finished after 531 iterations, 333.93277 ms total, 0.638699130435 ms per iteration
Found 2 regions


MAZE 2: Finished after 266 iterations, 188.24075 ms total, 0.639058300322 ms per iteration
Found 35 regions

PART 3
MAZE 1: Finished after 9 iterations, 6.11306 ms total, 0.82901 ms per iteration
Found 2 regions


MAZE 2: Finished after 8 iterations, 6.73873 ms total, 0.83655 ms per iteration
Found 35 regions


PART 5

The benefit of atomic_min is that it combines the comparison and the swap
into a single step, writing the minimum value directly into memory. This
eliminates the race condition, because no other thread can interleave a
write between the read and the store. Although a plain min is technically
faster than atomic_min, the algorithm will likely need fewer iterations
with atomic_min, because no update is lost: a location never has to be
revisited just to re-compare against a value that another thread had
already written.
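
A minimal sketch of the difference (hypothetical kernels, not part of the assignment): both take the minimum of a shared label and a per-thread candidate, but only the atomic version is safe when many work-items target the same entry.

    // Non-atomic: the read and the write can interleave with other work-items,
    // so a smaller candidate written in between can be overwritten and lost.
    __kernel void relabel_racy(__global int *labels,
                               __global const int *candidates,
                               int target)
    {
        int i = get_global_id(0);
        labels[target] = min(labels[target], candidates[i]);
    }

    // Atomic: the compare and the store happen as one indivisible operation,
    // so the smallest candidate always survives.
    __kernel void relabel_atomic(__global int *labels,
                                 __global const int *candidates,
                                 int target)
    {
        int i = get_global_id(0);
        atomic_min(&labels[target], candidates[i]);
    }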