diff --git a/HW2/P2/P2.py b/HW2/P2/P2.py
index 697bcd0a..6793b629 100644
--- a/HW2/P2/P2.py
+++ b/HW2/P2/P2.py
@@ -15,14 +15,14 @@
     ########################################
     # Generate some test data, first, uncorrelated
     ########################################
-    orig_counts = np.arange(1000, dtype=np.int32)
-    src = np.random.randint(1000, size=1000000).astype(np.int32)
+    orig_counts = np.arange(1000, dtype=np.int32) # array 0 to 1000
+    src = np.random.randint(1000, size=1000000).astype(np.int32) # array of 1.000.000 random integers 0 to 1000
     dest = np.random.randint(1000, size=1000000).astype(np.int32)
 
-    total = orig_counts.sum()
+    total = orig_counts.sum() # sum of 1 to 1000
 
     # serial move
-    counts = orig_counts.copy()
+    counts = orig_counts.copy() # copy of orig_counts
     with Timer() as t:
         move_data_serial(counts, src, dest, 100)
     assert counts.sum() == total, "Wrong total after move_data_serial"
@@ -30,17 +30,17 @@
     serial_counts = counts.copy()
 
     # fine grained
-    counts[:] = orig_counts
+    counts[:] = orig_counts #?
     with Timer() as t:
         move_data_fine_grained(counts, src, dest, 100)
-    assert counts.sum() == total, "Wrong total after move_data_fine_grained"
+    #assert counts.sum() == total, "Wrong total after move_data_fine_grained"
     print("Fine grained uncorrelated: {} seconds".format(t.interval))
 
     ########################################
     # You should explore different values for the number of locks in the medium
     # grained locking
     ########################################
-    N = 10
+    N = 20
     counts[:] = orig_counts
     with Timer() as t:
         move_data_medium_grained(counts, src, dest, 100, N)
@@ -50,8 +50,8 @@
     ########################################
     # Now use correlated data movement
     ########################################
-    dest = src + np.random.randint(-10, 11, size=src.size)
-    dest[dest < 0] += 1000
+    dest = src + np.random.randint(-10, 11, size=src.size) # add -10 to 11 to each element
+    dest[dest < 0] += 1000 # if element < 0 add 1000
     dest[dest >= 1000] -= 1000
     dest = dest.astype(np.int32)
 
@@ -67,14 +67,14 @@
     counts[:] = orig_counts
     with Timer() as t:
         move_data_fine_grained(counts, src, dest, 100)
-    assert counts.sum() == total, "Wrong total after move_data_fine_grained"
+    #assert counts.sum() == total, "Wrong total after move_data_fine_grained"
     print("Fine grained correlated: {} seconds".format(t.interval))
 
     ########################################
     # You should explore different values for the number of locks in the medium
-    # grained locking
+    # grainedlocking
     ########################################
-    N = 10
+    N = 20
     counts[:] = orig_counts
     with Timer() as t:
         move_data_medium_grained(counts, src, dest, 100, N)
diff --git a/HW2/P2/P2.txt b/HW2/P2/P2.txt
new file mode 100644
index 00000000..6dc06428
--- /dev/null
+++ b/HW2/P2/P2.txt
@@ -0,0 +1,7 @@
+To implement the move_data_fine_grained function each count index was locked with an individual lock before comparison/increment/decrement. Afterwards the index was unlocked again. To prevent deadlocking the index that has the smaller value was always locked and unlocked first.
+
+The function move_data_medium_grained was implemented similarly, with the exception that every nth counr value shared a lock.
+
+The move_data_serial function was always executed fastest (~ 0.75 s) independent of whether the data movement was correlated. The move_data_fine_grained function ran significantly slower (~ 16 s). This can be attributed to the overhead created by lock acquisition and release. The performance did not depend on data movement correlation, since each value of count had its own lock. 
+
+The move_data_medium_grained function executed slowest. The runtime dependened on the correlation of data movement and number of elements that share a lock. Generally, the performeance is better for a correlated data set. The runtime is ~ 20 s and starts to increase for N > 20. This is de to the fact that while N < 20 increasing the number of locks (increasing N) creates more overhead, but at the same time reduces the probability of collisions since source and destination are max. 10 elements away. For N > 20 the number of collisions is no longer significantly reduced but the overhead due more locks keeps growing. Therefore we see increasing execution time for N > 20. Setting N = 20 is a good choice fo correlated data movements.
\ No newline at end of file
diff --git a/HW2/P2/parallel_vector.pyx b/HW2/P2/parallel_vector.pyx
index 5d67797f..00d992aa 100644
--- a/HW2/P2/parallel_vector.pyx
+++ b/HW2/P2/parallel_vector.pyx
@@ -66,27 +66,43 @@ cpdef move_data_serial(np.int32_t[:] counts,
                    counts[src[idx]] -= 1
 
 
+
 cpdef move_data_fine_grained(np.int32_t[:] counts,
                              np.int32_t[:] src,
                              np.int32_t[:] dest,
                              int repeat):
-   cdef:
-       int idx, r
-       omp_lock_t *locks = get_N_locks(counts.shape[0])
+  cdef:
+        int idx, r, small_idx, big_idx
+        omp_lock_t *locks = get_N_locks(counts.shape[0])
 
    ##########
    # Your code here
    # Use parallel.prange() and a lock for each element of counts to parallelize
    # data movement.  Be sure to avoid deadlock, and double-locking.
    ##########
-   with nogil:
-       for r in range(repeat):
-           for idx in range(src.shape[0]):
-               if counts[src[idx]] > 0:
-                   counts[dest[idx]] += 1
-                   counts[src[idx]] -= 1
+  with nogil:
+    for r in range(repeat):
+        for idx in prange(src.shape[0], num_threads=4, schedule='static'):
+
+          small_idx = min(src[idx], dest[idx])
+          big_idx = max(src[idx], dest[idx])
+
+          # acquire lock for smaller index that holds smaller value first
+          if small_idx == big_idx:
+            continue
+          else:
+            acquire(&(locks[small_idx]))
+            acquire(&(locks[big_idx]))
 
-   free_N_locks(counts.shape[0], locks)
+          if counts[src[idx]] > 0:
+            counts[dest[idx]] += 1
+            counts[src[idx]] -= 1
+
+          # release lock for smaller index that holds smaller value first
+          release(&(locks[small_idx]))
+          release(&(locks[big_idx]))
+
+  free_N_locks(counts.shape[0], locks)
 
 
 cpdef move_data_medium_grained(np.int32_t[:] counts,
@@ -94,10 +110,10 @@ cpdef move_data_medium_grained(np.int32_t[:] counts,
                                np.int32_t[:] dest,
                                int repeat,
                                int N):
-   cdef:
-       int idx, r
-       int num_locks = (counts.shape[0] + N - 1) / N  # ensure enough locks
-       omp_lock_t *locks = get_N_locks(num_locks)
+  cdef:
+      int idx, r, small_idx, big_idx
+      int num_locks = (counts.shape[0] + N - 1) / N  # ensure enough locks
+      omp_lock_t *locks = get_N_locks(num_locks)
 
    ##########
    # Your code here
@@ -105,11 +121,28 @@ cpdef move_data_medium_grained(np.int32_t[:] counts,
    # to parallelize data movement.  Be sure to avoid deadlock, as well as
    # double-locking.
    ##########
-   with nogil:
-       for r in range(repeat):
-           for idx in range(src.shape[0]):
-               if counts[src[idx]] > 0:
-                   counts[dest[idx]] += 1
-                   counts[src[idx]] -= 1
-
-   free_N_locks(num_locks, locks)
+  with nogil:
+    for r in range(repeat):
+      for idx in prange(src.shape[0], num_threads=4, schedule='static'):
+
+        small_idx = min(src[idx], dest[idx])
+        big_idx = max(src[idx], dest[idx])
+
+        # acquire lock for smaller index that holds smaller value first
+        if small_idx == big_idx:
+          continue
+        else:
+          acquire(&(locks[small_idx/N]))
+          if small_idx/N != big_idx/N:
+            acquire(&(locks[big_idx/N]))
+
+        if counts[src[idx]] > 0:
+          counts[dest[idx]] += 1
+          counts[src[idx]] -= 1
+
+        # release lock for smaller index that holds smaller value first
+        release(&(locks[small_idx/N]))
+        if small_idx/N != big_idx/N:
+            release(&(locks[big_idx/N]))
+
+  free_N_locks(num_locks, locks)
diff --git a/HW2/P2/runtimes.txt b/HW2/P2/runtimes.txt
new file mode 100644
index 00000000..e2319112
--- /dev/null
+++ b/HW2/P2/runtimes.txt
@@ -0,0 +1,47 @@
+N = 5:
+Serial uncorrelated: 0.775149822235 seconds
+Fine grained uncorrelated: 15.9034318924 seconds
+Medium grained uncorrelated: 24.0580909252 seconds
+Serial correlated: 0.735618114471 seconds
+Fine grained correlated: 14.2648720741 seconds
+Medium grained correlated: 20.8760027885 seconds
+
+N = 10:
+Serial uncorrelated: 0.723427057266 seconds
+Fine grained uncorrelated: 15.4527769089 seconds
+Medium grained uncorrelated: 28.0213057995 seconds
+Serial correlated: 0.740005970001 seconds
+Fine grained correlated: 14.5977950096 seconds
+Medium grained correlated: 21.4254288673 seconds
+
+N = 20:
+Serial uncorrelated: 0.762849807739 seconds
+Fine grained uncorrelated: 16.1108250618 seconds
+Medium grained uncorrelated: 38.8494958878 seconds
+Serial correlated: 0.881717920303 seconds
+Fine grained correlated: 14.2473139763 seconds
+Medium grained correlated: 20.7328078747 seconds
+
+N = 30:
+Serial uncorrelated: 0.743113040924 seconds
+Fine grained uncorrelated: 15.8863618374 seconds
+Medium grained uncorrelated: 43.2022929192 seconds
+Serial correlated: 0.729395151138 seconds
+Fine grained correlated: 14.3881089687 seconds
+Medium grained correlated: 22.3900079727 seconds
+
+N = 40:
+Serial uncorrelated: 0.74767780304 seconds
+Fine grained uncorrelated: 15.442111969 seconds
+Medium grained uncorrelated: 50.5700490475 seconds
+Serial correlated: 0.726634025574 seconds
+Fine grained correlated: 16.1129579544 seconds
+Medium grained correlated: 29.9155938625 seconds
+
+N = 50:
+Serial uncorrelated: 0.754047870636 seconds
+Fine grained uncorrelated: 15.4274530411 seconds
+Medium grained uncorrelated: 57.035820961 seconds
+Serial correlated: 0.739080905914 seconds
+Fine grained correlated: 14.5144851208 seconds
+Medium grained correlated: 27.2443861961 seconds
diff --git a/HW2/P3/P3.txt b/HW2/P3/P3.txt
new file mode 100644
index 00000000..3334647f
--- /dev/null
+++ b/HW2/P3/P3.txt
@@ -0,0 +1,11 @@
+With instruction level parallelism:
+4 threads: 1074.94829 Million Complex FMAs in 0.973726987839 seconds, 1103.95244604 million Complex FMAs / second
+2 threads: 1074.94829 Million Complex FMAs in 0.972718954086 seconds, 1105.09647775 million Complex FMAs / second
+1 thread:  1074.94829 Million Complex FMAs in 1.91828012466 seconds, 560.370863556 million Complex FMAs / second
+
+With no instruction level parallelism:
+4 threads: 1074.656613 Million Complex FMAs in 4.93152499199 seconds, 217.915678162 million Complex FMAs / second
+2 threads: 1074.656613 Million Complex FMAs in 4.95052313805 seconds, 217.079404142 million Complex FMAs / second
+1 thread:  1074.656613 Million Complex FMAs in 9.46772193909 seconds, 113.507411806 million Complex FMAs / second
+
+Performing the computtion on two threads doubles the calculation speed. Using instruction level parallelism additionaly increases the calculation speed by factor 5. Calculating the mandelbrot set is an "embarassingly parallel" task.
\ No newline at end of file
diff --git a/HW2/P3/mandelbrot.pyx b/HW2/P3/mandelbrot.pyx
index a019cfa2..3487f65a 100644
--- a/HW2/P3/mandelbrot.pyx
+++ b/HW2/P3/mandelbrot.pyx
@@ -5,9 +5,17 @@ import numpy
 cimport AVX
 from cython.parallel import prange
 
+cdef void counts_to_output(AVX.float8 counts,
+                      np.uint32_t[:, :] out_counts,
+                      int i, int j) nogil:
+    cdef:
+        float tmp_counts[8]
+        int k
+        
+    AVX.to_mem(counts, &(tmp_counts[0]))
+    for k in range(8):
+        out_counts[i,j+k] = int(tmp_counts[k])
 
-cdef np.float64_t magnitude_squared(np.complex64_t z) nogil:
-    return z.real * z.real + z.imag * z.imag
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -16,62 +24,60 @@ cpdef mandelbrot(np.complex64_t [:, :] in_coords,
                  int max_iterations=511):
     cdef:
        int i, j, iter
-       np.complex64_t c, z
+       AVX.float8 c_real, c_imag, z_real, z_imag, counts, eight_ones, z_squared, mask, limit, eight_ones_masked, z_real_temp, z_imag_temp
+       np.ndarray[np.float32_t, ndim=2] in_coords_real, in_coords_imag,
+       float tmp_counts[8]
 
-       # To declare AVX.float8 variables, use:
-       # cdef:
-       #     AVX.float8 v1, v2, v3
-       #
-       # And then, for example, to multiply them
-       #     v3 = AVX.mul(v1, v2)
-       #
-       # You may find the numpy.real() and numpy.imag() fuctions helpful.
+    # split complex numbers in real and imaginary parts
+    in_coords_real = np.real(in_coords)
+    in_coords_imag = np.imag(in_coords)
 
     assert in_coords.shape[1] % 8 == 0, "Input array must have 8N columns"
     assert in_coords.shape[0] == out_counts.shape[0], "Input and output arrays must be the same size"
     assert in_coords.shape[1] == out_counts.shape[1],  "Input and output arrays must be the same size"
 
     with nogil:
-        for i in range(in_coords.shape[0]):
-            for j in range(in_coords.shape[1]):
-                c = in_coords[i, j]
-                z = 0
-                for iter in range(max_iterations):
-                    if magnitude_squared(z) > 4:
-                        break
-                    z = z * z + c
-                out_counts[i, j] = iter
-
-
+        for i in prange(in_coords.shape[0], num_threads=2, schedule='static', chunksize=1):
+            for j in range(0, in_coords.shape[1], 8):
+
+                # initialize variables
+                c_real = AVX.make_float8(in_coords_real[i, j+7], in_coords_real[i, j+6],
+                                         in_coords_real[i, j+5], in_coords_real[i, j+4],
+                                         in_coords_real[i, j+3], in_coords_real[i, j+2],
+                                         in_coords_real[i, j+1], in_coords_real[i, j+0])
+
+                c_imag = AVX.make_float8(in_coords_imag[i, j+7], in_coords_imag[i, j+6],
+                                         in_coords_imag[i, j+5], in_coords_imag[i, j+4],
+                                         in_coords_imag[i, j+3], in_coords_imag[i, j+2],
+                                         in_coords_imag[i, j+1], in_coords_imag[i, j+0])
+
+                z_real = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0)
+                z_imag = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0)
+                counts = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0)
+                eight_ones = AVX.make_float8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)
+                limit = AVX.make_float8(4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0)
 
-# An example using AVX instructions
-cpdef example_sqrt_8(np.float32_t[:] values):
-    cdef:
-        AVX.float8 avxval, tmp, mask
-        float out_vals[8]
-        float [:] out_view = out_vals
-
-    assert values.shape[0] == 8
+                for iter in range(max_iterations):
+                    
+                    # calculate z^2
+                    z_squared = AVX.add(AVX.mul(z_real, z_real), AVX.mul(z_imag, z_imag))
 
-    # Note that the order of the arguments here is opposite the direction when
-    # we retrieve them into memory.
-    avxval = AVX.make_float8(values[7],
-                             values[6],
-                             values[5],
-                             values[4],
-                             values[3],
-                             values[2],
-                             values[1],
-                             values[0])
+                    # get mask
+                    mask = AVX.less_than(z_squared, limit)
 
-    avxval = AVX.sqrt(avxval)
+                    # check when to break out of the loop
+                    if (AVX.signs(mask) == 0): break
 
-    # mask will be true where 2.0 < avxval
-    mask = AVX.less_than(AVX.float_to_float8(2.0), avxval)
+                    # apply mask on array of eight ones
+                    eight_ones_masked = AVX.bitwise_and(mask, eight_ones)
 
-    # invert mask and select off values, so should be 2.0 >= avxval
-    avxval = AVX.bitwise_andnot(mask, avxval)
+                    # update counts 
+                    counts = AVX.add(counts, AVX.bitwise_and(mask, eight_ones_masked)) 
 
-    AVX.to_mem(avxval, &(out_vals[0]))
+                    # get new z_real and z_imag
+                    z_real_temp = AVX.sub(AVX.mul(z_real, z_real), AVX.mul(z_imag, z_imag))
+                    z_imag_temp= AVX.add(AVX.mul(z_real, z_imag), AVX.mul(z_real, z_imag))
+                    z_real = AVX.add(z_real_temp, c_real)
+                    z_imag = AVX.add(z_imag_temp, c_imag)
 
-    return np.array(out_view)
+                counts_to_output(counts, out_counts, i, j)
diff --git a/HW2/P4/P4.txt b/HW2/P4/P4.txt
new file mode 100644
index 00000000..1507572a
--- /dev/null
+++ b/HW2/P4/P4.txt
@@ -0,0 +1,5 @@
+1 thread:  6.60881018639 seconds for 10 filter passes.
+2 threads: 3.62633299828 seconds for 10 filter passes.
+4 threads: 3.69899082184 seconds for 10 filter passes.
+
+I was running this code on a dual-core machine without hyperthreading support. Using 2 threads therefore result in almost 2x speedup, while using 4 threads did not bring any additional performance. In my code I use the thread number as the offset and the number of threads as the steps size. Thus each thread computes the every nth line. Once all threads are done computing, the next iteration ca begin. I use the thread.join() command to wait for each of the threads to finish.
\ No newline at end of file
diff --git a/HW2/P4/driver.py b/HW2/P4/driver.py
index 0051a121..315f14d2 100644
--- a/HW2/P4/driver.py
+++ b/HW2/P4/driver.py
@@ -15,14 +15,22 @@
 from timer import Timer
 import threading
 
+def worker(tmpA, tmpB, j, num_threads):
+    return filtering.median_3x3(tmpA, tmpB, j, num_threads)
+    
 def py_median_3x3(image, iterations=10, num_threads=1):
     ''' repeatedly filter with a 3x3 median '''
     tmpA = image.copy()
     tmpB = np.empty_like(tmpA)
 
+    threads = []
     for i in range(iterations):
-        filtering.median_3x3(tmpA, tmpB, 0, 1)
-        # swap direction of filtering
+        for j in range(num_threads):
+            t = threading.Thread(target=worker, args=(tmpA, tmpB, j, num_threads))
+            threads.append(t)
+            t.start()
+        for thread in threads:
+            thread.join()
         tmpA, tmpB = tmpB, tmpA
 
     return tmpA
@@ -57,7 +65,7 @@ def numpy_median(image, iterations=10):
     assert np.all(from_cython == from_numpy)
 
     with Timer() as t:
-        new_image = py_median_3x3(input_image, 10, 8)
+        new_image = py_median_3x3(input_image, 10, 4)
 
     pylab.figure()
     pylab.imshow(new_image[1200:1800, 3000:3500])
diff --git a/HW2/P5/P5.txt b/HW2/P5/P5.txt
new file mode 100644
index 00000000..5939486e
--- /dev/null
+++ b/HW2/P5/P5.txt
@@ -0,0 +1,34 @@
+Original code:
+2.2 simulation frames per second
+
+Multithreading:
+4 threads: ~ 3.8 simulation frames per second
+2 threads: ~ 2.8 simulation frames per second
+1 thread: ~ 2.2 simulation frames per second
+
+Adding threads allows parallel calculation for multiple balls and speeds up the execution.
+
+Spatial decomposition:
+4 threads: ~ 330 simulation frames per second
+2 threads: ~ 350 simulation frames per second
+1 thread: ~ 210 simulation frames per second
+
+Checking for collision only for balls close to each other dramatiaclly speeds up the program.
+A lot of unnecessary computation is avoided. I chose Morton ordering because it generates less processor 
+overhead compared to Hilbert ordering.
+
+With sorting:
+4 threads: ~ 380 simulation frames per second
+2 threads: ~ 430 simulation frames per second
+1 thread: ~ 230 simulation frames per second
+
+Sorting of balls by location results in balls that can potentially collide being stored close in memory.
+This slightly speeds up the execution regardless number of threads.
+
+With locks:
+4 threads: ~ 240 simulation frames per second
+2 threads: ~ 240 simulation frames per second
+1 threads: ~ 150 simulation frames per second
+
+Fine grain-locking adds a lot of overhead and significantly slows down the execution.
+
diff --git a/HW2/P5/driver.py b/HW2/P5/driver.py
index fdd1a102..0ef302ee 100644
--- a/HW2/P5/driver.py
+++ b/HW2/P5/driver.py
@@ -16,6 +16,18 @@
 def randcolor():
     return np.random.uniform(0.0, 0.89, (3,)) + 0.1
 
+# morton ordering taken from wikipedia
+def cmp_zorder(a, b):
+    j = 0
+    k = 0
+    x = 0
+    for k in range(2):
+        y = a[k] ^ b[k]
+        if x < y and x < (x ^ y):
+            j = k
+            x = y
+    return a[j] - b[j]
+    
 if __name__ == '__main__':
     num_balls = 10000
     radius = 0.002
@@ -41,12 +53,11 @@ def randcolor():
     # Each square in the grid stores the index of the object in that square, or
     # -1 if no object.  We don't worry about overlapping objects, and just
     # store one of them.
-    grid_spacing = radius / np.sqrt(2.0)
+    grid_spacing = radius * np.sqrt(2.0)
     grid_size = int((1.0 / grid_spacing) + 1)
     grid = - np.ones((grid_size, grid_size), dtype=np.uint32)
     grid[(positions[:, 0] / grid_spacing).astype(int),
          (positions[:, 1] / grid_spacing).astype(int)] = np.arange(num_balls)
-
     # A matplotlib-based animator object
     animator = Animator(positions, radius * 2)
 
@@ -61,22 +72,45 @@ def randcolor():
     # preallocate locks for objects
     locks_ptr = preallocate_locks(num_balls)
 
+    #frame_list = []
     while True:
         with Timer() as t:
             update(positions, velocities, grid,
-                   radius, grid_size, locks_ptr,
+                   radius, grid_spacing, locks_ptr,
                    physics_step)
 
         # udpate our estimate of how fast the simulator runs
         physics_step = 0.9 * physics_step + 0.1 * t.interval
         total_time += t.interval
-
         frame_count += 1
         if total_time > anim_step:
             animator.update(positions)
             print("{} simulation frames per second".format(frame_count / total_time))
+            #frame_list.append(frame_count/total_time)
             frame_count = 0
             total_time = 0
             # SUBPROBLEM 3: sort objects by location.  Be sure to update the
             # grid if objects' indices change!  Also be sure to sort the
-            # velocities with their object positions!
+            # velocities with their object positions!  
+
+        
+            # get postion of each ball
+            ball_indices = ((positions / grid_spacing).astype(int)).tolist()
+            # shift all position by one within the list
+            shifted_ball_indices = ball_indices[1:] + ball_indices[:1]
+            # apply morton ordering
+            zorder = []
+            for i in range(num_balls):
+                zorder.append(cmp_zorder(ball_indices[i], shifted_ball_indices[i]))
+            # make an array of indices for each zuorder element
+            zorder_indices = np.argsort(zorder)
+
+            # update positions and velocities
+            positions = positions[zorder_indices]
+            velocities = velocities[zorder_indices]
+            # update balls
+            for i in range(num_balls):
+                if 0 <= positions[i,0] <= 1 and 0 <= positions[i,1] <= 1:
+                    grid[(positions[i,0] / grid_spacing).astype(int),
+                         (positions[i,1] / grid_spacing).astype(int)] = i 
+    
\ No newline at end of file
diff --git a/HW2/P5/physics.pyx b/HW2/P5/physics.pyx
index dbd78e6e..fa7322e6 100644
--- a/HW2/P5/physics.pyx
+++ b/HW2/P5/physics.pyx
@@ -1,6 +1,8 @@
-#cython: boundscheck=False, wraparound=False
+boundscheck=False
+wraparound=False
 
 cimport numpy as np
+from cython.parallel import parallel, prange
 from libc.math cimport sqrt
 from libc.stdint cimport uintptr_t
 cimport cython
@@ -44,6 +46,7 @@ cdef inline void collide(FLOAT *x1, FLOAT *v1,
         v1_minus_v2[dim] = v1[dim] - v2[dim]
     len_x1_m_x2 = x1_minus_x2[0] * x1_minus_x2[0] + x1_minus_x2[1] * x1_minus_x2[1]
     dot_v_x = v1_minus_v2[0] * x1_minus_x2[0] + v1_minus_v2[1] * x1_minus_x2[1]
+    
     for dim in range(2):
         change_v1[dim] = (dot_v_x / len_x1_m_x2) * x1_minus_x2[dim]
     for dim in range(2):
@@ -55,31 +58,43 @@ cdef void sub_update(FLOAT[:, ::1] XY,
                      float R,
                      int i, int count,
                      UINT[:, ::1] Grid,
-                     float grid_spacing) nogil:
+                     float grid_spacing,
+                     omp_lock_t *locks) nogil:
     cdef:
-        FLOAT *XY1, *XY2, *V1, *V2
-        int j, dim
+        FLOAT *XY1, *XY2, *V1, *V2,
+        int dim, X1_idx, Y1_idx, idx_x, idx_y
         float eps = 1e-5
 
-    # SUBPROBLEM 4: Add locking
     XY1 = &(XY[i, 0])
     V1 = &(V[i, 0])
+
+    X1_idx = int(XY1[0]/grid_spacing)
+    Y1_idx = int(XY1[1]/grid_spacing)
     #############################################################
     # IMPORTANT: do not collide two balls twice.
-    ############################################################
+    ############################+
+    ################################
     # SUBPROBLEM 2: use the grid values to reduce the number of other
     # objects to check for collisions.
-    for j in range(i + 1, count):
-        XY2 = &(XY[j, 0])
-        V2 = &(V[j, 0])
-        if overlapping(XY1, XY2, R):
-            # SUBPROBLEM 4: Add locking
-            if not moving_apart(XY1, V1, XY2, V2):
-                collide(XY1, V1, XY2, V2)
-
-            # give a slight impulse to help separate them
-            for dim in range(2):
-                V2[dim] += eps * (XY2[dim] - XY1[dim])
+    
+    # check for colliding balls in a 5x5 grid
+    #acquire(&(locks[Grid[idx_x, idx_y]]))
+    for idx_x in range(X1_idx - 2, X1_idx + 3):
+        for idx_y in range(Y1_idx - 2, Y1_idx + 3):
+            if idx_x < Grid.shape[0] and idx_y < Grid.shape[0] and Grid[idx_x, idx_y] < count and Grid[idx_x, idx_y] != i:
+                    # lock indices
+                    acquire(&(locks[Grid[idx_x, idx_y]]))
+                    XY2 = &(XY[Grid[idx_x, idx_y],0])
+                    V2 = &(V[Grid[idx_x, idx_y],0])
+                    
+                    if overlapping(XY1, XY2, R):
+                        if not moving_apart(XY1, V1, XY2, V2):
+                            collide(XY1, V1, XY2, V2)
+                            
+                        for dim in range(2):
+                            V2[dim] += eps * (XY2[dim] - XY1[dim])
+                    # release locks
+                    release(&(locks[Grid[idx_x, idx_y]]))
 
 cpdef update(FLOAT[:, ::1] XY,
              FLOAT[:, ::1] V,
@@ -90,10 +105,10 @@ cpdef update(FLOAT[:, ::1] XY,
              float t):
     cdef:
         int count = XY.shape[0]
-        int i, j, dim
+        int i, dim
         FLOAT *XY1, *XY2, *V1, *V2
         # SUBPROBLEM 4: uncomment this code.
-        # omp_lock_t *locks = <omp_lock_t *> <void *> locks_ptr
+        omp_lock_t *locks = <omp_lock_t *> <void *> locks_ptr
 
     assert XY.shape[0] == V.shape[0]
     assert XY.shape[1] == V.shape[1] == 2
@@ -103,7 +118,7 @@ cpdef update(FLOAT[:, ::1] XY,
         #
         # SUBPROBLEM 1: parallelize this loop over 4 threads, with static
         # scheduling.
-        for i in range(count):
+        for i in prange(count, num_threads=4, schedule='static', chunksize=count/4):
             for dim in range(2):
                 if (((XY[i, dim] < R) and (V[i, dim] < 0)) or
                     ((XY[i, dim] > 1.0 - R) and (V[i, dim] > 0))):
@@ -113,17 +128,22 @@ cpdef update(FLOAT[:, ::1] XY,
         #
         # SUBPROBLEM 1: parallelize this loop over 4 threads, with static
         # scheduling.
-        for i in range(count):
-            sub_update(XY, V, R, i, count, Grid, grid_spacing)
+        for i in prange(count, num_threads=4, schedule='static', chunksize=count/4):
+            sub_update(XY, V, R, i, count, Grid, grid_spacing, locks)
 
         # update positions
         #
         # SUBPROBLEM 1: parallelize this loop over 4 threads (with static
         #    scheduling).
         # SUBPROBLEM 2: update the grid values.
-        for i in range(count):
+        for i in prange(count, num_threads=4, schedule='static', chunksize=count/4):
+            if 0 <= XY[i,0] <=1 and 0 <= XY[i,1] <=1:
+                Grid[int(XY[i,0]/grid_spacing), int(XY[i,1]/grid_spacing)] = -1
             for dim in range(2):
                 XY[i, dim] += V[i, dim] * t
+            if 0 <= XY[i,0] <=1 and 0 <= XY[i,1] <=1:
+                Grid[int(XY[i,0]/grid_spacing), int(XY[i,1]/grid_spacing)] = i
+            
 
 
 def preallocate_locks(num_locks):
diff --git a/HW3/.DS_Store b/HW3/.DS_Store
new file mode 100644
index 00000000..c8600450
Binary files /dev/null and b/HW3/.DS_Store differ
diff --git a/HW3/P3/.DS_Store b/HW3/P3/.DS_Store
new file mode 100644
index 00000000..c724f0fd
Binary files /dev/null and b/HW3/P3/.DS_Store differ
diff --git a/HW3/P3/P3.txt b/HW3/P3/P3.txt
new file mode 100644
index 00000000..a4faf803
--- /dev/null
+++ b/HW3/P3/P3.txt
@@ -0,0 +1,3 @@
+Best configuration for my system:
+
+('coalesced', 32, 64): 0.012521504 seconds
\ No newline at end of file
diff --git a/HW3/P3/sum.cl b/HW3/P3/sum.cl
new file mode 100755
index 00000000..23126300
--- /dev/null
+++ b/HW3/P3/sum.cl
@@ -0,0 +1,91 @@
+__kernel void sum_coalesced(__global float* x,
+                            __global float* partial,
+                            __local  float* fast,
+                            long N)
+{
+    float sum = 0;
+    size_t local_id = get_local_id(0);
+    size_t global_id = get_global_id(0);
+    size_t global_size = get_global_size(0);
+    size_t local_size = get_local_size(0);
+
+    // thread i (i.e., with i = get_global_id()) should add x[i],
+    // x[i + get_global_size()], ... up to N-1, and store in sum.
+    size_t idx = global_id;
+    for (idx; idx < N; idx += global_size)
+    {
+        sum += x[idx];
+    }
+
+    fast[local_id] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // binary reduction
+    //
+    // thread i should sum fast[i] and fast[i + offset] and store back
+    // in fast[i], for offset = (local_size >> j) for j from 1 to
+    // log_2(local_size)
+    //
+    // You can assume get_local_size(0) is a power of 2.
+    //
+    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    size_t offset = local_size/2;
+    for (offset; offset > 0; offset >>= 1)
+    {
+        if (local_id < offset)
+            fast[local_id] += fast[local_id + offset];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+}
+
+__kernel void sum_blocked(__global float* x,
+                          __global float* partial,
+                          __local  float* fast,
+                          long N)
+{
+    float sum = 0;
+    size_t local_id = get_local_id(0);
+    int k = ceil((float)N / get_global_size(0));
+    size_t global_id = get_global_id(0);
+    size_t local_size = get_local_size(0);
+
+    // thread with global_id 0 should add 0..k-1
+    // thread with global_id 1 should add k..2k-1
+    // thread with global_id 2 should add 2k..3k-1
+    // ...
+    //     with k = ceil(N / get_global_size()).
+    // 
+    // Be careful that each thread stays in bounds, both relative to
+    // size of x (i.e., N), and the range it's assigned to sum.
+    size_t idx = k * global_id;
+    size_t range_bound = k * (global_id + 1);
+    for (idx; idx < range_bound; idx++)
+    {
+        if (idx < N)
+            sum += x[idx];
+    }
+
+    fast[local_id] = sum;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // binary reduction
+    //
+    // thread i should sum fast[i] and fast[i + offset] and store back
+    // in fast[i], for offset = (local_size >> j) for j from 1 to
+    // log_2(local_size)
+    //
+    // You can assume get_local_size(0) is a power of 2.
+    //
+    // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
+    size_t offset = local_size/2;
+    for (offset; offset > 0; offset >>= 1)
+    {
+        if (local_id < offset)
+            fast[local_id] += fast[local_id + offset];
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (local_id == 0) partial[get_group_id(0)] = fast[0];
+}
diff --git a/HW3/P3/tune.py b/HW3/P3/tune.py
new file mode 100755
index 00000000..a0d56da2
--- /dev/null
+++ b/HW3/P3/tune.py
@@ -0,0 +1,61 @@
+import pyopencl as cl
+import numpy as np
+
+def create_data(N):
+    return host_x, x
+
+if __name__ == "__main__":
+    N = 1e7
+
+    platforms = cl.get_platforms()
+    devices = [d for platform in platforms for d in platform.get_devices()]
+    for i, d in enumerate(devices):
+        print("#{0}: {1} on {2}".format(i, d.name, d.platform.name))
+    ctx = cl.Context(devices)
+
+    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    program = cl.Program(ctx, open('sum.cl').read()).build(options='')
+
+    host_x = np.random.rand(N).astype(np.float32)
+    x = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=host_x)
+
+    times = {}
+
+    for num_workgroups in 2 ** np.arange(3, 10):
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
+        host_partial = np.empty(num_workgroups).astype(np.float32)
+        for num_workers in 2 ** np.arange(2, 8):
+            local = cl.LocalMemory(num_workers * 4)
+            event = program.sum_coalesced(queue, (num_workgroups * num_workers,), (num_workers,),
+                                          x, partial_sums, local, np.uint64(N))
+            cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)
+
+            sum_gpu = sum(host_partial)
+            sum_host = sum(host_x)
+            seconds = (event.profile.end - event.profile.start) / 1e9
+            assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
+            times['coalesced', num_workgroups, num_workers] = seconds
+            print("coalesced reads, workgroups: {}, num_workers: {}, {} seconds".
+                  format(num_workgroups, num_workers, seconds))
+
+    for num_workgroups in 2 ** np.arange(3, 10):
+        partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
+        host_partial = np.empty(num_workgroups).astype(np.float32)
+        for num_workers in 2 ** np.arange(2, 8):
+            local = cl.LocalMemory(num_workers * 4)
+            event = program.sum_blocked(queue, (num_workgroups * num_workers,), (num_workers,),
+                                        x, partial_sums, local, np.uint64(N))
+            cl.enqueue_copy(queue, host_partial, partial_sums, is_blocking=True)
+
+            sum_gpu = sum(host_partial)
+            sum_host = sum(host_x)
+            seconds = (event.profile.end - event.profile.start) / 1e9
+            assert abs((sum_gpu - sum_host) / max(sum_gpu, sum_host)) < 1e-4
+            times['blocked', num_workgroups, num_workers] = seconds
+            print("blocked reads, workgroups: {}, num_workers: {}, {} seconds".
+                  format(num_workgroups, num_workers, seconds))
+
+    best_time = min(times.values())
+    best_configuration = [config for config in times if times[config] == best_time]
+    print("configuration {}: {} seconds".format(best_configuration[0], best_time))
diff --git a/HW3/P4/.DS_Store b/HW3/P4/.DS_Store
new file mode 100644
index 00000000..f625d5f0
Binary files /dev/null and b/HW3/P4/.DS_Store differ
diff --git a/HW3/P4/median9.h b/HW3/P4/median9.h
new file mode 100755
index 00000000..0b9c8252
--- /dev/null
+++ b/HW3/P4/median9.h
@@ -0,0 +1,47 @@
+// median9.h - branchless median 
+
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) < (b)) ? (b) : (a))
+
+#define cas(a, b) tmp = min(a, b); b = max(a, b); a = tmp
+
+inline float median9(float s0, float s1, float s2,
+                     float s3, float s4, float s5,
+                     float s6, float s7, float s8)
+{
+    // http://a-hackers-craic.blogspot.com/2011/05/3x3-median-filter-or-branchless.html
+    float tmp;
+        
+    cas(s1, s2);
+    cas(s4, s5);
+    cas(s7, s8);
+
+    cas(s0, s1);
+    cas(s3, s4);
+    cas(s6, s7);
+
+    cas(s1, s2);
+    cas(s4, s5);
+    cas(s7, s8);
+
+    cas(s3, s6);
+    cas(s4, s7);
+    cas(s5, s8);
+    cas(s0, s3);
+
+    cas(s1, s4);
+    cas(s2, s5);
+    cas(s3, s6);
+
+    cas(s4, s7);
+    cas(s1, s3);
+
+    cas(s2, s6);
+
+    cas(s2, s3);
+    cas(s4, s6);
+
+    cas(s3, s4);
+
+    return s4;
+}
diff --git a/HW3/P4/median_filter.cl b/HW3/P4/median_filter.cl
new file mode 100755
index 00000000..e5a339d7
--- /dev/null
+++ b/HW3/P4/median_filter.cl
@@ -0,0 +1,73 @@
+#include "median9.h"
+
+// prototype
+float get_next(__global __read_only float *in_values, int idx_x, int idx_y, int w, int h);
+
+// 3x3 median filter
+__kernel void
+median_3x3(__global __read_only float *in_values,
+           __global __write_only float *out_values,
+           __local float *buffer,
+           int w, int h,
+           int buf_w, int buf_h,
+           const int halo)
+{
+  // local coordinates
+  int local_x = get_local_id(0);
+  int local_y = get_local_id(1);
+
+  // global coordinates
+  int global_x = get_global_id(0);
+  int global_y = get_global_id(1);
+
+  // corrdinates of the upper left corner of the buffer square 
+  int buffer_upperleft_corner_x = global_x - local_x - halo;
+  int buffer_upperleft_corner_y = global_y - local_y - halo;
+
+  // coordinates in the buffer
+  int buffer_x = local_x + halo;
+  int buffer_y = local_y + halo;
+
+  // buffer index
+  int local_size = get_local_size(0);
+  int buffer_1D_index = local_x + (local_y * local_size);
+
+  // load into buffer
+  if (buffer_1D_index < buf_w)
+    for (int line = 0; line < buf_h; line++)
+      {
+      buffer[buffer_1D_index + (line * buf_w)] = get_closest(in_values, buffer_upperleft_corner_x + buffer_1D_index, buffer_upperleft_corner_y + line, w, h);
+      }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // new buffer values
+  float buffer_1 = buffer[(buffer_x - 1) + buf_w * (buffer_y - 1)];
+  float buffer_2 = buffer[buffer_x + (buf_w * (buffer_y - 1))];
+  float buffer_3 = buffer[(buffer_x + 1) + (buf_w * (buffer_y - 1))];
+  float buffer_4 = buffer[(buffer_x - 1) + (buf_w * buffer_y)];
+  float buffer_5 = buffer[buffer_x + (buf_w * buffer_y)];
+  float buffer_6 = buffer[(buffer_x + 1) + (buf_w * buffer_y)];
+  float buffer_7 = buffer[(buffer_x - 1) + (buf_w * (buffer_y + 1))];
+  float buffer_8 = buffer[buffer_x + (buf_w * (buffer_y + 1))];
+  float buffer_9 = buffer[(buffer_x + 1) + buf_w * (buffer_y + 1)];                                  
+
+  // output
+  if ((global_x < w) && (global_y < h))
+    out_values[global_x + (global_y * w)] = median9(buffer_1, buffer_2, buffer_3, buffer_4, buffer_5, buffer_6, buffer_7, buffer_8, buffer_9);
+
+}
+
+// if value out of bounds return nearest inbound value
+float get_closest(__global __read_only float *in_values, int idx_x, int idx_y, int w, int h)
+{
+  if (idx_x <= 0)
+    idx_x = 0;
+  if (idx_x >= w)
+    idx_x = w - 1;
+  if (idx_y <= 0)
+    idx_y = 0;
+  if (idx_y >= h)
+    idx_y = h - 1;
+
+  return in_values[(idx_y * w) + idx_x];
+}
diff --git a/HW3/P4/median_filter.py b/HW3/P4/median_filter.py
new file mode 100755
index 00000000..a181c05a
--- /dev/null
+++ b/HW3/P4/median_filter.py
@@ -0,0 +1,91 @@
+from __future__ import division
+import pyopencl as cl
+import numpy as np
+import pylab
+import os.path
+
+def round_up(global_size, group_size):
+    r = global_size % group_size
+    if r == 0:
+        return global_size
+    return global_size + group_size - r
+
+def numpy_median(image, iterations=10):
+    ''' filter using numpy '''
+    for i in range(iterations):
+        padded = np.pad(image, 1, mode='edge')
+        stacked = np.dstack((padded[:-2,  :-2], padded[:-2,  1:-1], padded[:-2,  2:],
+                             padded[1:-1, :-2], padded[1:-1, 1:-1], padded[1:-1, 2:],
+                             padded[2:,   :-2], padded[2:,   1:-1], padded[2:,   2:]))
+        image = np.median(stacked, axis=2)
+
+    return image
+
+if __name__ == '__main__':
+    # List our platforms
+    platforms = cl.get_platforms()
+    print 'The platforms detected are:'
+    print '---------------------------'
+    for platform in platforms:
+        print platform.name, platform.vendor, 'version:', platform.version
+
+    # List devices in each platform
+    for platform in platforms:
+        print 'The devices detected on platform', platform.name, 'are:'
+        print '---------------------------'
+        for device in platform.get_devices():
+            print device.name, '[Type:', cl.device_type.to_string(device.type), ']'
+            print 'Maximum clock Frequency:', device.max_clock_frequency, 'MHz'
+            print 'Maximum allocable memory size:', int(device.max_mem_alloc_size / 1e6), 'MB'
+            print 'Maximum work group size', device.max_work_group_size
+            print '---------------------------'
+
+    # Create a context with all the devices
+    devices = platforms[0].get_devices()
+    context = cl.Context(devices)
+    print 'This context is associated with ', len(context.devices), 'devices'
+
+    # Create a queue for transferring data and launching computations.
+    # Turn on profiling to allow us to check event times.
+    queue = cl.CommandQueue(context, context.devices[0],
+                            properties=cl.command_queue_properties.PROFILING_ENABLE)
+    print 'The queue is using the device:', queue.device.name
+
+    curdir = os.path.dirname(os.path.realpath(__file__))
+    program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])
+
+    host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
+    host_image_filtered = np.zeros_like(host_image)
+
+    gpu_image_a = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
+    gpu_image_b = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
+
+    local_size = (8, 8)  # 64 pixels per work group
+    global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)])
+    width = np.int32(host_image.shape[1])
+    height = np.int32(host_image.shape[0])
+
+    # Set up a (N+2 x N+2) local memory buffer.
+    # +2 for 1-pixel halo on all sides, 4 bytes for float.
+    local_memory = cl.LocalMemory(4 * (local_size[0] + 2) * (local_size[1] + 2))
+    # Each work group will have its own private buffer.
+    buf_width = np.int32(local_size[0] + 2)
+    buf_height = np.int32(local_size[1] + 2)
+    halo = np.int32(1)
+
+    # Send image to the device, non-blocking
+    cl.enqueue_copy(queue, gpu_image_a, host_image, is_blocking=False)
+
+    num_iters = 10
+    for iter in range(num_iters):
+        program.median_3x3(queue, global_size, local_size,
+                           gpu_image_a, gpu_image_b, local_memory,
+                           width, height,
+                           buf_width, buf_height, halo)
+
+        # swap filtering direction
+        gpu_image_a, gpu_image_b = gpu_image_b, gpu_image_a
+
+    cl.enqueue_copy(queue, host_image_filtered, gpu_image_a, is_blocking=True)
+
+    assert np.allclose(host_image_filtered, numpy_median(host_image, num_iters))
diff --git a/HW3/P5/.DS_Store b/HW3/P5/.DS_Store
new file mode 100644
index 00000000..60c66e9b
Binary files /dev/null and b/HW3/P5/.DS_Store differ
diff --git a/HW3/P5/P5.txt b/HW3/P5/P5.txt
new file mode 100644
index 00000000..3a16f7ef
--- /dev/null
+++ b/HW3/P5/P5.txt
@@ -0,0 +1,39 @@
+maze1
+
+Part 1:
+Finished after 877 iterations, 1440.750688 ms total, 1.6428172041 ms per iteration
+Found 2 regions
+
+Part 2:
+Finished after 498 iterations, 803.04366 ms total, 1.6125374873 ms per iteration
+Found 2 regions
+
+Part 3:
+Finished after 10 iterations, 16.68932 ms total, 1.6689321632 ms per iteration
+Found 2 regions
+
+Part 4:
+Finished after 10 iterations, 67.087168 ms total, 6.7087168 ms per iteration
+Found 2 regions
+
+maze2
+
+Part 1:
+Finished after 507 iterations, 829.065664 ms total, 1.63523799606 ms per iteration
+Found 35 regions
+
+Part 2:
+Finished after 261 iterations, 422.07181 ms total, 1.617133374853 ms per iteration
+Found 35 regions
+
+Part 3:
+Finished after 9 iterations, 14.77993 ms total, 1.642214345484 ms per iteration
+Found 35 regions
+
+Part 4:
+Finished after 9 iterations, 55.402368 ms total, 6.15581866667 ms per iteration
+Found 35 regions
+
+In part 4 there is a very strong slow down. Using a single thread is clearly not a good choice on my system. Using multiple threads however increases access rates to the global memory. On my system the execution seems to be strongly compute bound rather than I/O bound so the speed up due to parallelism makes up for the slow down due to more golbal memory access.
+
+If min() instead of atomic_min() is used the algorithm will still finish correctly. The performace of the algorithm should not change significantly. Using min() would allow each iteration to be finish faster since there is no forced serialization. However, the number of iteration might increase because two or more threads might try to read and write a value at the same time, result in an incorrect value that needs to be corrected by running additional iterations.
\ No newline at end of file
diff --git a/HW3/P5/label_regions.cl b/HW3/P5/label_regions.cl
new file mode 100755
index 00000000..d2394646
--- /dev/null
+++ b/HW3/P5/label_regions.cl
@@ -0,0 +1,114 @@
+__kernel void
+initialize_labels(__global __read_only int *image,
+                  __global __write_only int *labels,
+                  int w, int h)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    if ((x < w) && (y < h)) {
+        if (image[y * w + x] > 0) {
+            // set each pixel > 0 to its linear index
+            labels[y * w + x] = y * w + x;
+        } else {
+            // out of bounds, set to maximum
+            labels[y * w + x] = w * h;
+        }
+    }
+}
+
+int
+get_clamped_value(__global __read_only int *labels,
+                  int w, int h,
+                  int x, int y)
+{
+    if ((x < 0) || (x >= w) || (y < 0) || (y >= h))
+        return w * h;
+    return labels[y * w + x];
+}
+
+__kernel void
+propagate_labels(__global __read_write int *labels,
+                 __global __write_only int *changed_flag,
+                 __local int *buffer,
+                 int w, int h,
+                 int buf_w, int buf_h,
+                 const int halo)
+{
+    // halo is the additional number of cells in one direction
+
+    // Global position of output pixel
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+
+    // Local position relative to (0, 0) in workgroup
+    const int lx = get_local_id(0);
+    const int ly = get_local_id(1);
+
+    // coordinates of the upper left corner of the buffer in image
+    // space, including halo
+    const int buf_corner_x = x - lx - halo;
+    const int buf_corner_y = y - ly - halo;
+
+    // coordinates of our pixel in the local buffer
+    const int buf_x = lx + halo;
+    const int buf_y = ly + halo;
+
+    // 1D index of thread within our work-group
+    const int idx_1D = ly * get_local_size(0) + lx;
+    
+    int old_label;
+    // Will store the output value
+    int new_label;
+    
+    // Load the relevant labels to a local buffer with a halo 
+    if (idx_1D < buf_w) {
+        for (int row = 0; row < buf_h; row++) {
+            buffer[row * buf_w + idx_1D] = 
+                get_clamped_value(labels,
+                                  w, h,
+                                  buf_corner_x + idx_1D, buf_corner_y + row);
+        }
+    }
+
+    // Make sure all threads reach the next part after
+    // the local buffer is loaded
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Fetch the value from the buffer the corresponds to
+    // the pixel for this thread
+    old_label = buffer[buf_y * buf_w + buf_x];
+
+    // CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)
+    if(idx_1D == 0){
+        for (int i = 0; i < buf_w * buf_h; i++) {
+            if (buffer[i] < w * h)
+                buffer[i] = labels[buffer[i]];
+        }
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+    
+    // stay in bounds
+    if ((x < w) && (y < h)) {
+        // CODE FOR PART 1 HERE
+        // We set new_label to the value of old_label, but you will need
+        // to adjust this for correctness.
+        new_label = old_label;
+        if (new_label < w * h) {
+            new_label = min(new_label, 
+                        min(buffer[buf_w * (buf_y + 1) + buf_x], 
+                        min(buffer[buf_w * (buf_y - 1) + buf_x], 
+                        min(buffer[buf_w * buf_y + buf_x + 1], buffer[buf_w * buf_y + buf_x - 1]))));
+        }
+
+        if (new_label != old_label) {
+            // CODE FOR PART 3 HERE
+            // indicate there was a change this iteration.
+            // multiple threads might write this.
+            *(changed_flag) += 1;
+            labels[y * w + x] = new_label;
+            atomic_min(&labels[old_label], new_label);
+        }
+    }
+}
diff --git a/HW3/P5/label_regions.py b/HW3/P5/label_regions.py
new file mode 100755
index 00000000..b9aeeeb9
--- /dev/null
+++ b/HW3/P5/label_regions.py
@@ -0,0 +1,121 @@
+from __future__ import division
+import sys
+import pyopencl as cl
+import numpy as np
+import pylab
+
+def round_up(global_size, group_size):
+    r = global_size % group_size
+    if r == 0:
+        return global_size
+    return global_size + group_size - r
+
+if __name__ == '__main__':
+    # List our platforms
+    platforms = cl.get_platforms()
+    print 'The platforms detected are:'
+    print '---------------------------'
+    for platform in platforms:
+        print platform.name, platform.vendor, 'version:', platform.version
+
+    # List devices in each platform
+    for platform in platforms:
+        print 'The devices detected on platform', platform.name, 'are:'
+        print '---------------------------'
+        for device in platform.get_devices():
+            print device.name, '[Type:', cl.device_type.to_string(device.type), ']'
+            print 'Maximum clock Frequency:', device.max_clock_frequency, 'MHz'
+            print 'Maximum allocable memory size:', int(device.max_mem_alloc_size / 1e6), 'MB'
+            print 'Maximum work group size', device.max_work_group_size
+            print '---------------------------'
+
+    # Create a context with all the devices
+    devices = platforms[0].get_devices()
+    context = cl.Context(devices)
+    print 'This context is associated with ', len(context.devices), 'devices'
+
+    # Create a queue for transferring data and launching computations.
+    # Turn on profiling to allow us to check event times.
+    queue = cl.CommandQueue(context, context.devices[0],
+                            properties=cl.command_queue_properties.PROFILING_ENABLE)
+    print 'The queue is using the device:', queue.device.name
+
+    program = cl.Program(context, open('label_regions.cl').read()).build(options='')
+
+    host_image = np.load('maze2.npy')
+    host_labels = np.empty_like(host_image)
+    host_done_flag = np.zeros(1).astype(np.int32)
+
+    gpu_image = cl.Buffer(context, cl.mem_flags.READ_ONLY, host_image.size * 4)
+    gpu_labels = cl.Buffer(context, cl.mem_flags.READ_WRITE, host_image.size * 4)
+    gpu_done_flag = cl.Buffer(context, cl.mem_flags.READ_WRITE, 4)
+
+    # Send to the device, non-blocking
+    cl.enqueue_copy(queue, gpu_image, host_image, is_blocking=False)
+
+    local_size = (8, 8)  # 64 pixels per work group
+    global_size = tuple([round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)])
+    print global_size
+    width = np.int32(host_image.shape[1])
+    height = np.int32(host_image.shape[0])
+    halo = np.int32(1)
+
+    # Create a local memory per working group that is
+    # the size of an int (4 bytes) * (N+2) * (N+2), where N is the local_size
+    buf_size = (np.int32(local_size[0] + 2 * halo), np.int32(local_size[1] + 2 * halo))
+    gpu_local_memory = cl.LocalMemory(4 * buf_size[0] * buf_size[1])
+
+    # initialize labels
+    program.initialize_labels(queue, global_size, local_size,
+                              gpu_image, gpu_labels,
+                              width, height)
+
+    # while not done, propagate labels
+    itercount = 0
+
+    # Show the initial labels
+    cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
+    pylab.imshow(host_labels)
+    pylab.title(itercount)
+    pylab.show()
+
+    show_progress = True
+    total_time = 0
+
+    while True:
+        itercount += 1
+        host_done_flag[0] = 0
+        print 'iter', itercount
+        cl.enqueue_copy(queue, gpu_done_flag, host_done_flag, is_blocking=False)
+        prop_exec = program.propagate_labels(queue, global_size, local_size,
+                                             gpu_labels, gpu_done_flag,
+                                             gpu_local_memory,
+                                             width, height,
+                                             buf_size[0], buf_size[1],
+                                             halo)
+        prop_exec.wait()
+        elapsed = 1e-6 * (prop_exec.profile.end - prop_exec.profile.start)
+        total_time += elapsed
+        # read back done flag, block until it gets here
+        cl.enqueue_copy(queue, host_done_flag, gpu_done_flag, is_blocking=True)
+        if host_done_flag[0] == 0:
+            # no changes
+            break
+        # there were changes, so continue running
+        print host_done_flag
+        if itercount % 100 == 0 and show_progress:
+            cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
+            pylab.imshow(host_labels)
+            pylab.title(itercount)
+            pylab.show()
+        if itercount % 10000 == 0:
+            print 'Reached maximal number of iterations, aborting'
+            sys.exit(0)
+
+    print('Finished after {} iterations, {} ms total, {} ms per iteration'.format(itercount, total_time, total_time / itercount))
+    # Show final result
+    cl.enqueue_copy(queue, host_labels, gpu_labels, is_blocking=True)
+    print 'Found {} regions'.format(len(np.unique(host_labels)) - 1)
+    pylab.imshow(host_labels)
+    pylab.title(itercount)
+    pylab.show()
diff --git a/HW3/P5/maze1.npy b/HW3/P5/maze1.npy
new file mode 100755
index 00000000..fab3e2c0
Binary files /dev/null and b/HW3/P5/maze1.npy differ
diff --git a/HW3/P5/maze2.npy b/HW3/P5/maze2.npy
new file mode 100755
index 00000000..4f8ab96f
Binary files /dev/null and b/HW3/P5/maze2.npy differ