harvard-cs205 · sdaulton · Nov 14, 2015 · Nov 15, 2015 · Nov 15, 2015 · Nov 15, 2015
diff --git a/HW3/P2/mandelbrot.cl b/HW3/P2/mandelbrot.cl
@@ -9,11 +9,27 @@ mandelbrot(__global __read_only float *coords_real,
     const int y = get_global_id(1);
 
     float c_real, c_imag;
-    float z_real, z_imag;
-    int iter;
+    float z_real, z_imag, z_imag_sq, z_real_sq;
+    int iter, offset;
 
     if ((x < w) && (y < h)) {
         // YOUR CODE HERE
-        ;
+        offset = y * w + x;
+        // Get complex number c
+        c_real = coords_real[offset];
+        c_imag = coords_imag[offset];
+        // Initialize z
+        z_real = 0.0;
+        z_imag = 0.0;
+        for (iter = 0; iter < max_iter; iter++) {
+            z_real_sq = z_real * z_real;
+            z_imag_sq = z_imag * z_imag;
+            if (z_real_sq + z_imag_sq > 4) {
+                break;
+            }
+            z_imag = 2 * z_real * z_imag + c_imag;
+            z_real = z_real_sq - z_imag_sq + c_real;
+        }
+        out_counts[offset] = iter;
     }
 }
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
@@ -0,0 +1,85 @@
+coalesced reads, workgroups: 8, num_workers: 4, 0.13523816 seconds
+coalesced reads, workgroups: 8, num_workers: 8, 0.05002128 seconds
+coalesced reads, workgroups: 8, num_workers: 16, 0.02228568 seconds
+coalesced reads, workgroups: 8, num_workers: 32, 0.01243688 seconds
+coalesced reads, workgroups: 8, num_workers: 64, 0.00679152 seconds
+coalesced reads, workgroups: 8, num_workers: 128, 0.00387344 seconds
+coalesced reads, workgroups: 16, num_workers: 4, 0.05827352 seconds
+coalesced reads, workgroups: 16, num_workers: 8, 0.02823304 seconds
+coalesced reads, workgroups: 16, num_workers: 16, 0.01497992 seconds
+coalesced reads, workgroups: 16, num_workers: 32, 0.00788968 seconds
+coalesced reads, workgroups: 16, num_workers: 64, 0.00434872 seconds
+coalesced reads, workgroups: 16, num_workers: 128, 0.0030288 seconds
+coalesced reads, workgroups: 32, num_workers: 4, 0.02866544 seconds
+coalesced reads, workgroups: 32, num_workers: 8, 0.01335272 seconds
+coalesced reads, workgroups: 32, num_workers: 16, 0.007186 seconds
+coalesced reads, workgroups: 32, num_workers: 32, 0.00406992 seconds
+coalesced reads, workgroups: 32, num_workers: 64, 0.00277344 seconds
+coalesced reads, workgroups: 32, num_workers: 128, 0.00296696 seconds
+coalesced reads, workgroups: 64, num_workers: 4, 0.03042336 seconds
+coalesced reads, workgroups: 64, num_workers: 8, 0.01716112 seconds
+coalesced reads, workgroups: 64, num_workers: 16, 0.01011488 seconds
+coalesced reads, workgroups: 64, num_workers: 32, 0.00509712 seconds
+coalesced reads, workgroups: 64, num_workers: 64, 0.0039112 seconds
+coalesced reads, workgroups: 64, num_workers: 128, 0.00386304 seconds
+coalesced reads, workgroups: 128, num_workers: 4, 0.03748808 seconds
+coalesced reads, workgroups: 128, num_workers: 8, 0.01877584 seconds
+coalesced reads, workgroups: 128, num_workers: 16, 0.01011024 seconds
+coalesced reads, workgroups: 128, num_workers: 32, 0.00510552 seconds
+coalesced reads, workgroups: 128, num_workers: 64, 0.00361184 seconds
+coalesced reads, workgroups: 128, num_workers: 128, 0.00353344 seconds
+coalesced reads, workgroups: 256, num_workers: 4, 0.04078736 seconds
+coalesced reads, workgroups: 256, num_workers: 8, 0.01956208 seconds
+coalesced reads, workgroups: 256, num_workers: 16, 0.01072384 seconds
+coalesced reads, workgroups: 256, num_workers: 32, 0.00538184 seconds
+coalesced reads, workgroups: 256, num_workers: 64, 0.00410528 seconds
+coalesced reads, workgroups: 256, num_workers: 128, 0.00426576 seconds
+coalesced reads, workgroups: 512, num_workers: 4, 0.04820448 seconds
+coalesced reads, workgroups: 512, num_workers: 8, 0.02555352 seconds
+coalesced reads, workgroups: 512, num_workers: 16, 0.01377392 seconds
+coalesced reads, workgroups: 512, num_workers: 32, 0.00804696 seconds
+coalesced reads, workgroups: 512, num_workers: 64, 0.00533144 seconds
+coalesced reads, workgroups: 512, num_workers: 128, 0.00534184 seconds
+blocked reads, workgroups: 8, num_workers: 4, 0.18848704 seconds
+blocked reads, workgroups: 8, num_workers: 8, 0.0736176 seconds
+blocked reads, workgroups: 8, num_workers: 16, 0.04906264 seconds
+blocked reads, workgroups: 8, num_workers: 32, 0.02473608 seconds
+blocked reads, workgroups: 8, num_workers: 64, 0.01126728 seconds
+blocked reads, workgroups: 8, num_workers: 128, 0.02127384 seconds
+blocked reads, workgroups: 16, num_workers: 4, 0.06236696 seconds
+blocked reads, workgroups: 16, num_workers: 8, 0.03539944 seconds
+blocked reads, workgroups: 16, num_workers: 16, 0.02401944 seconds
+blocked reads, workgroups: 16, num_workers: 32, 0.01006624 seconds
+blocked reads, workgroups: 16, num_workers: 64, 0.01824392 seconds
+blocked reads, workgroups: 16, num_workers: 128, 0.0521756 seconds
+blocked reads, workgroups: 32, num_workers: 4, 0.02761504 seconds
+blocked reads, workgroups: 32, num_workers: 8, 0.01609232 seconds
+blocked reads, workgroups: 32, num_workers: 16, 0.01000312 seconds
+blocked reads, workgroups: 32, num_workers: 32, 0.01815032 seconds
+blocked reads, workgroups: 32, num_workers: 64, 0.04871368 seconds
+blocked reads, workgroups: 32, num_workers: 128, 0.06726688 seconds
+blocked reads, workgroups: 64, num_workers: 4, 0.02830136 seconds
+blocked reads, workgroups: 64, num_workers: 8, 0.01547568 seconds
+blocked reads, workgroups: 64, num_workers: 16, 0.0123616 seconds
+blocked reads, workgroups: 64, num_workers: 32, 0.02625976 seconds
+blocked reads, workgroups: 64, num_workers: 64, 0.08451304 seconds
+blocked reads, workgroups: 64, num_workers: 128, 0.08008256 seconds
+blocked reads, workgroups: 128, num_workers: 4, 0.02486112 seconds
+blocked reads, workgroups: 128, num_workers: 8, 0.0149828 seconds
+blocked reads, workgroups: 128, num_workers: 16, 0.01152056 seconds
+blocked reads, workgroups: 128, num_workers: 32, 0.0220164 seconds
+blocked reads, workgroups: 128, num_workers: 64, 0.06941248 seconds
+blocked reads, workgroups: 128, num_workers: 128, 0.05325168 seconds
+blocked reads, workgroups: 256, num_workers: 4, 0.02504432 seconds
+blocked reads, workgroups: 256, num_workers: 8, 0.01262448 seconds
+blocked reads, workgroups: 256, num_workers: 16, 0.00845736 seconds
+blocked reads, workgroups: 256, num_workers: 32, 0.02512984 seconds
+blocked reads, workgroups: 256, num_workers: 64, 0.05071976 seconds
+blocked reads, workgroups: 256, num_workers: 128, 0.03916664 seconds
+blocked reads, workgroups: 512, num_workers: 4, 0.02529368 seconds
+blocked reads, workgroups: 512, num_workers: 8, 0.01537016 seconds
+blocked reads, workgroups: 512, num_workers: 16, 0.0116124 seconds
+blocked reads, workgroups: 512, num_workers: 32, 0.03006704 seconds
+blocked reads, workgroups: 512, num_workers: 64, 0.04359768 seconds
+blocked reads, workgroups: 512, num_workers: 128, 0.0251344 seconds
+configuration ('coalesced', 32, 64): 0.00277344 seconds <-- *** BEST ***
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
@@ -1,72 +1,91 @@
-__kernel void sum_coalesced(__global float* x,
-                            __global float* partial,
-                            __local  float* fast,
-                            long N)
-{
-    float sum = 0;
-    size_t local_id = get_local_id(0);
-
-    // thread i (i.e., with i = get_global_id()) should add x[i],
-    // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE 
-    }
-
-    fast[local_id] = sum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // binary reduction
-    //
-    // thread i should sum fast[i] and fast[i + offset] and store back
-    // in fast[i], for offset = (local_size >> j) for j from 1 to
-    // log_2(local_size)
-    //
-    // You can assume get_local_size(0) is a power of 2.
-    //
-    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
-
-    if (local_id == 0) partial[get_group_id(0)] = fast[0];
-}
-
-__kernel void sum_blocked(__global float* x,
-                          __global float* partial,
-                          __local  float* fast,
-                          long N)
-{
-    float sum = 0;
-    size_t local_id = get_local_id(0);
-    int k = ceil((float)N / get_global_size(0));
-
-    // thread with global_id 0 should add 0..k-1
-    // thread with global_id 1 should add k..2k-1
-    // thread with global_id 2 should add 2k..3k-1
-    // ...
-    //     with k = ceil(N / get_global_size()).
-    // 
-    // Be careful that each thread stays in bounds, both relative to
-    // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
-
-    fast[local_id] = sum;
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // binary reduction
-    //
-    // thread i should sum fast[i] and fast[i + offset] and store back
-    // in fast[i], for offset = (local_size >> j) for j from 1 to
-    // log_2(local_size)
-    //
-    // You can assume get_local_size(0) is a power of 2.
-    //
-    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
-    }
-
-    if (local_id == 0) partial[get_group_id(0)] = fast[0];
-}
+// Sum x[0..N-1]: work-item i adds x[i], x[i + global size], ... and each
+// work-group writes one partial sum to partial[group id]. fast is local
+// scratch with one float per work-item of the group.
+__kernel void sum_coalesced(__global float* x,
+                            __global float* partial,
+                            __local  float* fast,
+                            long N)
+{
+    float sum = 0;
+    const size_t local_id = get_local_id(0);
+    const size_t local_size = get_local_size(0);
+    // 64-bit stride and index: N is a long, so an int index would overflow
+    // before reaching the loop bound once N exceeds INT_MAX.
+    const long global_size = get_global_size(0);
+    long offset;
+    size_t j;
+
+    // thread i (i.e., with i = get_global_id()) adds x[i],
+    // x[i + get_global_size()], ... up to N-1, and stores the total in sum.
+    for (offset = get_global_id(0); offset < N; offset += global_size) {
+        sum += x[offset];
+    }
+
+    // publish this thread's total so the rest of the work-group can read it
+    fast[local_id] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // binary reduction: j halves every pass, and threads with local_id < j
+    // add fast[local_id + j] into fast[local_id], so each pair's sum lands at
+    // the lower of the two indexes. Requires get_local_size(0) to be a
+    // power of 2.
+    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    for (j = local_size >> 1; j > 0; j >>= 1) {
+        if (local_id < j) {
+            fast[local_id] += fast[local_id + j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+}
+
+// Sum x[0..N-1]: each work-item adds one contiguous run of k elements and
+// each work-group writes one partial sum to partial[group id]. fast is
+// local scratch with one float per work-item of the group.
+__kernel void sum_blocked(__global float* x,
+                          __global float* partial,
+                          __local  float* fast,
+                          long N)
+{
+    float sum = 0;
+    const size_t local_id = get_local_id(0);
+    const size_t local_size = get_local_size(0);
+    const long global_size = get_global_size(0);
+    // k = ceil(N / global_size) in integer arithmetic. Going through float
+    // is lossy: float cannot represent every N above 2^24, so k could come
+    // out too small and the tail of x would be silently left out.
+    const long k = (N + global_size - 1) / global_size;
+    size_t j;
+
+    // thread with global_id 0 adds 0..k-1
+    // thread with global_id 1 adds k..2k-1
+    // thread with global_id 2 adds 2k..3k-1
+    // ...
+    // Offsets are 64-bit so k * global_id cannot overflow an int. end is
+    // clipped to N, keeping each thread inside both its own range and x.
+    long offset = k * (long)get_global_id(0);
+    long end = offset + k; // one past the last offset this thread reads
+    if (end > N) {
+        end = N;
+    }
+    for (; offset < end; offset++) {
+        sum += x[offset];
+    }
+
+    // publish this thread's total so the rest of the work-group can read it
+    fast[local_id] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // binary reduction: j halves every pass, and threads with local_id < j
+    // add fast[local_id + j] into fast[local_id], so each pair's sum lands at
+    // the lower of the two indexes. Requires get_local_size(0) to be a
+    // power of 2.
+    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    for (j = local_size >> 1; j > 0; j >>= 1) {
+        if (local_id < j) {
+            fast[local_id] += fast[local_id + j];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+}
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
@@ -1,3 +1,5 @@
+import os
+os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
 import pyopencl as cl
 import numpy as np
 

diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
@@ -1,5 +1,30 @@
 #include "median9.h"
 
+// ADAPTED FROM HW3 P5 label_regions.cl
+// Read the pixel at (x, y) from the row-major image in_values of width w
+// and height h. A coordinate that falls outside the image is snapped to the
+// nearest valid column/row first, and the value of that pixel is returned.
+float get_clamped_value(__global __read_only float *in_values,int w, int h, int x, int y) {
+    // column: x < 0 snaps to 0, x >= w snaps to w - 1
+    int col = x;
+    if (col >= w) {
+        col = w - 1;
+    }
+    if (x < 0) {
+        col = 0;
+    }
+    // row: y < 0 snaps to 0, y >= h snaps to h - 1
+    int row = y;
+    if (row >= h) {
+        row = h - 1;
+    }
+    if (y < 0) {
+        row = 0;
+    }
+    // row-major lookup of the (possibly snapped) pixel
+    return in_values[row * w + col];
+}
+
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -31,4 +56,54 @@ median_3x3(__global __read_only float *in_values,
 
     // Each thread in the valid region (x < w, y < h) should write
     // back its 3x3 neighborhood median.
+
+	//ADAPTED FROM https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
+
+	// Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+
+	int row;
+	// Read pixel values and store in buffer
+	if (idx_1D < buf_w)
+	    for (row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] = \
+                get_clamped_value(in_values, 
+								  w, h,
+                      			  buf_corner_x + idx_1D,
+                      			  buf_corner_y + row);
+        }
+
+	    barrier(CLK_LOCAL_MEM_FENCE);
+
+		// write output
+	    if ((y < h) && (x < w)) // stay in bounds
+			// Calculate 3x3 median
+	        out_values[y * w + x] = \
+				median9(buffer[(buf_y - 1) * buf_w + buf_x - 1],
+						buffer[(buf_y - 1) * buf_w + buf_x],
+						buffer[(buf_y - 1) * buf_w + buf_x + 1],
+						buffer[buf_y * buf_w + buf_x - 1],
+						buffer[buf_y * buf_w + buf_x],
+						buffer[buf_y * buf_w + buf_x + 1],
+						buffer[(buf_y + 1) * buf_w + buf_x - 1],
+						buffer[(buf_y + 1) * buf_w + buf_x],
+						buffer[(buf_y + 1) * buf_w + buf_x + 1]);
+
 }