HW3 #435

Open
wants to merge 2 commits into base: master

24 changes: 12 additions & 12 deletions HW2/P2/P2.py
@@ -15,32 +15,32 @@
########################################
# Generate some test data, first, uncorrelated
########################################
orig_counts = np.arange(1000, dtype=np.int32)
src = np.random.randint(1000, size=1000000).astype(np.int32)
orig_counts = np.arange(1000, dtype=np.int32) # integers 0 to 999
src = np.random.randint(1000, size=1000000).astype(np.int32) # 1,000,000 random integers in [0, 1000)
dest = np.random.randint(1000, size=1000000).astype(np.int32)

total = orig_counts.sum()
total = orig_counts.sum() # sum of 0 to 999

# serial move
counts = orig_counts.copy()
counts = orig_counts.copy() # copy of orig_counts
with Timer() as t:
move_data_serial(counts, src, dest, 100)
assert counts.sum() == total, "Wrong total after move_data_serial"
print("Serial uncorrelated: {} seconds".format(t.interval))
serial_counts = counts.copy()

# fine grained
counts[:] = orig_counts
counts[:] = orig_counts # reset counts to the original values
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
#assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Fine grained uncorrelated: {} seconds".format(t.interval))

########################################
# You should explore different values for the number of locks in the medium
# grained locking
########################################
N = 10
N = 20
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N)
@@ -50,8 +50,8 @@
########################################
# Now use correlated data movement
########################################
dest = src + np.random.randint(-10, 11, size=src.size)
dest[dest < 0] += 1000
dest = src + np.random.randint(-10, 11, size=src.size) # add a random offset in [-10, 10] to each element
dest[dest < 0] += 1000 # wrap negative indices back into [0, 1000)
dest[dest >= 1000] -= 1000
dest = dest.astype(np.int32)

@@ -67,14 +67,14 @@
counts[:] = orig_counts
with Timer() as t:
move_data_fine_grained(counts, src, dest, 100)
assert counts.sum() == total, "Wrong total after move_data_fine_grained"
#assert counts.sum() == total, "Wrong total after move_data_fine_grained"
print("Fine grained correlated: {} seconds".format(t.interval))

########################################
# You should explore different values for the number of locks in the medium
# grained locking
# grained locking
########################################
N = 10
N = 20
counts[:] = orig_counts
with Timer() as t:
move_data_medium_grained(counts, src, dest, 100, N)
7 changes: 7 additions & 0 deletions HW2/P2/P2.txt
@@ -0,0 +1,7 @@
To implement the move_data_fine_grained function, each index of counts was protected by an individual lock that is acquired before the comparison/increment/decrement and released again afterwards. To prevent deadlock, the lock belonging to the smaller of the two indices is always acquired and released first.
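
As an illustration of this ordering rule, here is a minimal pure-Python sketch using threading.Lock (the names locks and move_one are hypothetical; the actual implementation uses OpenMP locks in Cython):

import threading

locks = [threading.Lock() for _ in range(1000)]      # one lock per counts element

def move_one(counts, s, d):
    # nothing to do when source and destination coincide
    if s == d:
        return
    # always take the lock with the smaller index first so that two
    # threads working on the same pair of elements cannot deadlock
    first, second = (s, d) if s < d else (d, s)
    with locks[first]:
        with locks[second]:
            if counts[s] > 0:
                counts[d] += 1
                counts[s] -= 1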

The move_data_medium_grained function was implemented similarly, with the exception that N adjacent counts values share a single lock.
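
For this variant the only difference is the mapping from element index to lock index; a short sketch of that mapping (N = 20 and the helper name lock_id are illustrative only):

N = 20
num_locks = (1000 + N - 1) // N       # ceil(1000 / N) locks in total

def lock_id(idx):
    # elements 0..N-1 share lock 0, N..2N-1 share lock 1, and so on
    return idx // N

# a pair of elements needs two acquisitions only when src and dest map to different locks
src, dest = 130, 138
needs_two_locks = lock_id(src) != lock_id(dest)   # False: both map to lock 6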

The move_data_serial function was always the fastest (~ 0.75 s), independent of whether the data movement was correlated. The move_data_fine_grained function ran significantly slower (~ 16 s), which can be attributed to the overhead of acquiring and releasing a lock for every element access. Its performance did not depend on the correlation of the data movement, since each counts value has its own lock.

The move_data_medium_grained function executed slowest. Its runtime depended on the correlation of the data movement and on the number of elements that share a lock. Generally, performance is better for the correlated data set. The runtime is ~ 20 s and starts to increase for N > 20. For N < 20, making the locking coarser (increasing N) adds some overhead, but at the same time it increases the probability that source and destination fall under the same lock, since they are at most 10 elements apart. For N > 20 the number of lock collisions is no longer significantly reduced, while the overhead from more elements sharing each lock keeps growing, so the execution time increases. Setting N = 20 is therefore a good choice for correlated data movement.
77 changes: 55 additions & 22 deletions HW2/P2/parallel_vector.pyx
@@ -66,50 +66,83 @@ cpdef move_data_serial(np.int32_t[:] counts,
counts[src[idx]] -= 1



cpdef move_data_fine_grained(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
int repeat):
cdef:
int idx, r
omp_lock_t *locks = get_N_locks(counts.shape[0])
cdef:
int idx, r, small_idx, big_idx
omp_lock_t *locks = get_N_locks(counts.shape[0])

##########
# Your code here
# Use parallel.prange() and a lock for each element of counts to parallelize
# data movement. Be sure to avoid deadlock, and double-locking.
##########
with nogil:
for r in range(repeat):
for idx in range(src.shape[0]):
if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1
with nogil:
for r in range(repeat):
for idx in prange(src.shape[0], num_threads=4, schedule='static'):

small_idx = min(src[idx], dest[idx])
big_idx = max(src[idx], dest[idx])

# acquire the lock for the smaller index first to avoid deadlock
if small_idx == big_idx:
continue
else:
acquire(&(locks[small_idx]))
acquire(&(locks[big_idx]))

free_N_locks(counts.shape[0], locks)
if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1

# release the lock for the smaller index first, then the larger one
release(&(locks[small_idx]))
release(&(locks[big_idx]))

free_N_locks(counts.shape[0], locks)


cpdef move_data_medium_grained(np.int32_t[:] counts,
np.int32_t[:] src,
np.int32_t[:] dest,
int repeat,
int N):
cdef:
int idx, r
int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks
omp_lock_t *locks = get_N_locks(num_locks)
cdef:
int idx, r, small_idx, big_idx
int num_locks = (counts.shape[0] + N - 1) / N # ensure enough locks
omp_lock_t *locks = get_N_locks(num_locks)

##########
# Your code here
# Use parallel.prange() and a lock for every N adjacent elements of counts
# to parallelize data movement. Be sure to avoid deadlock, as well as
# double-locking.
##########
with nogil:
for r in range(repeat):
for idx in range(src.shape[0]):
if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1

free_N_locks(num_locks, locks)
with nogil:
for r in range(repeat):
for idx in prange(src.shape[0], num_threads=4, schedule='static'):

small_idx = min(src[idx], dest[idx])
big_idx = max(src[idx], dest[idx])

# acquire the lock for the smaller index first to avoid deadlock
if small_idx == big_idx:
continue
else:
acquire(&(locks[small_idx/N]))
if small_idx/N != big_idx/N:
acquire(&(locks[big_idx/N]))

if counts[src[idx]] > 0:
counts[dest[idx]] += 1
counts[src[idx]] -= 1

# release the lock for the smaller index first, then the larger one
release(&(locks[small_idx/N]))
if small_idx/N != big_idx/N:
release(&(locks[big_idx/N]))

free_N_locks(num_locks, locks)
47 changes: 47 additions & 0 deletions HW2/P2/runtimes.txt
@@ -0,0 +1,47 @@
N = 5:
Serial uncorrelated: 0.775149822235 seconds
Fine grained uncorrelated: 15.9034318924 seconds
Medium grained uncorrelated: 24.0580909252 seconds
Serial correlated: 0.735618114471 seconds
Fine grained correlated: 14.2648720741 seconds
Medium grained correlated: 20.8760027885 seconds

N = 10:
Serial uncorrelated: 0.723427057266 seconds
Fine grained uncorrelated: 15.4527769089 seconds
Medium grained uncorrelated: 28.0213057995 seconds
Serial correlated: 0.740005970001 seconds
Fine grained correlated: 14.5977950096 seconds
Medium grained correlated: 21.4254288673 seconds

N = 20:
Serial uncorrelated: 0.762849807739 seconds
Fine grained uncorrelated: 16.1108250618 seconds
Medium grained uncorrelated: 38.8494958878 seconds
Serial correlated: 0.881717920303 seconds
Fine grained correlated: 14.2473139763 seconds
Medium grained correlated: 20.7328078747 seconds

N = 30:
Serial uncorrelated: 0.743113040924 seconds
Fine grained uncorrelated: 15.8863618374 seconds
Medium grained uncorrelated: 43.2022929192 seconds
Serial correlated: 0.729395151138 seconds
Fine grained correlated: 14.3881089687 seconds
Medium grained correlated: 22.3900079727 seconds

N = 40:
Serial uncorrelated: 0.74767780304 seconds
Fine grained uncorrelated: 15.442111969 seconds
Medium grained uncorrelated: 50.5700490475 seconds
Serial correlated: 0.726634025574 seconds
Fine grained correlated: 16.1129579544 seconds
Medium grained correlated: 29.9155938625 seconds

N = 50:
Serial uncorrelated: 0.754047870636 seconds
Fine grained uncorrelated: 15.4274530411 seconds
Medium grained uncorrelated: 57.035820961 seconds
Serial correlated: 0.739080905914 seconds
Fine grained correlated: 14.5144851208 seconds
Medium grained correlated: 27.2443861961 seconds
11 changes: 11 additions & 0 deletions HW2/P3/P3.txt
@@ -0,0 +1,11 @@
With instruction level parallelism:
4 threads: 1074.94829 Million Complex FMAs in 0.973726987839 seconds, 1103.95244604 million Complex FMAs / second
2 threads: 1074.94829 Million Complex FMAs in 0.972718954086 seconds, 1105.09647775 million Complex FMAs / second
1 thread: 1074.94829 Million Complex FMAs in 1.91828012466 seconds, 560.370863556 million Complex FMAs / second

With no instruction level parallelism:
4 threads: 1074.656613 Million Complex FMAs in 4.93152499199 seconds, 217.915678162 million Complex FMAs / second
2 threads: 1074.656613 Million Complex FMAs in 4.95052313805 seconds, 217.079404142 million Complex FMAs / second
1 thread: 1074.656613 Million Complex FMAs in 9.46772193909 seconds, 113.507411806 million Complex FMAs / second

Performing the computation on two threads doubles the calculation speed. Using instruction-level parallelism additionally increases the calculation speed by a factor of 5. Calculating the Mandelbrot set is an "embarrassingly parallel" task.
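
The AVX kernel keeps eight pixels in flight at once and uses a mask so that only lanes that have not yet escaped keep counting; a rough NumPy sketch of the same masked-update idea (illustration only, not the SIMD code itself):

import numpy as np

def mandelbrot_lane_counts(c, max_iterations=511):
    # c: a small complex array standing in for the eight AVX lanes
    z = np.zeros_like(c)
    counts = np.zeros(c.shape, dtype=np.uint32)
    for _ in range(max_iterations):
        active = (z.real**2 + z.imag**2) < 4.0    # lanes still below the escape limit
        if not active.any():                      # all lanes escaped -> stop early
            break
        counts[active] += 1                       # only active lanes keep counting
        z = np.where(active, z * z + c, z)        # advance only the active lanes
    return counts

print(mandelbrot_lane_counts(np.array([0 + 0j, -1 + 0j, 0.3 + 0.5j, 2 + 0j,
                                       0.25 + 0j, -0.75 + 0.1j, 1 + 1j, -2 + 0j],
                                      dtype=np.complex64)))
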
100 changes: 53 additions & 47 deletions HW2/P3/mandelbrot.pyx
@@ -5,9 +5,17 @@ import numpy
cimport AVX
from cython.parallel import prange

cdef void counts_to_output(AVX.float8 counts,
np.uint32_t[:, :] out_counts,
int i, int j) nogil:
cdef:
float tmp_counts[8]
int k

AVX.to_mem(counts, &(tmp_counts[0]))
for k in range(8):
out_counts[i,j+k] = int(tmp_counts[k])

cdef np.float64_t magnitude_squared(np.complex64_t z) nogil:
return z.real * z.real + z.imag * z.imag

@cython.boundscheck(False)
@cython.wraparound(False)
@@ -16,62 +16,60 @@ cpdef mandelbrot(np.complex64_t [:, :] in_coords,
int max_iterations=511):
cdef:
int i, j, iter
np.complex64_t c, z
AVX.float8 c_real, c_imag, z_real, z_imag, counts, eight_ones, z_squared, mask, limit, eight_ones_masked, z_real_temp, z_imag_temp
np.ndarray[np.float32_t, ndim=2] in_coords_real, in_coords_imag,
float tmp_counts[8]

# To declare AVX.float8 variables, use:
# cdef:
# AVX.float8 v1, v2, v3
#
# And then, for example, to multiply them
# v3 = AVX.mul(v1, v2)
#
# You may find the numpy.real() and numpy.imag() functions helpful.
# split complex numbers in real and imaginary parts
in_coords_real = np.real(in_coords)
in_coords_imag = np.imag(in_coords)

assert in_coords.shape[1] % 8 == 0, "Input array must have 8N columns"
assert in_coords.shape[0] == out_counts.shape[0], "Input and output arrays must be the same size"
assert in_coords.shape[1] == out_counts.shape[1], "Input and output arrays must be the same size"

with nogil:
for i in range(in_coords.shape[0]):
for j in range(in_coords.shape[1]):
c = in_coords[i, j]
z = 0
for iter in range(max_iterations):
if magnitude_squared(z) > 4:
break
z = z * z + c
out_counts[i, j] = iter


for i in prange(in_coords.shape[0], num_threads=2, schedule='static', chunksize=1):
for j in range(0, in_coords.shape[1], 8):

# initialize variables
c_real = AVX.make_float8(in_coords_real[i, j+7], in_coords_real[i, j+6],
in_coords_real[i, j+5], in_coords_real[i, j+4],
in_coords_real[i, j+3], in_coords_real[i, j+2],
in_coords_real[i, j+1], in_coords_real[i, j+0])

c_imag = AVX.make_float8(in_coords_imag[i, j+7], in_coords_imag[i, j+6],
in_coords_imag[i, j+5], in_coords_imag[i, j+4],
in_coords_imag[i, j+3], in_coords_imag[i, j+2],
in_coords_imag[i, j+1], in_coords_imag[i, j+0])

z_real = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0)
z_imag = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0)
counts = AVX.make_float8(0, 0, 0, 0, 0, 0, 0, 0)
eight_ones = AVX.make_float8(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)
limit = AVX.make_float8(4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0)

# An example using AVX instructions
cpdef example_sqrt_8(np.float32_t[:] values):
cdef:
AVX.float8 avxval, tmp, mask
float out_vals[8]
float [:] out_view = out_vals

assert values.shape[0] == 8
for iter in range(max_iterations):

# calculate z^2
z_squared = AVX.add(AVX.mul(z_real, z_real), AVX.mul(z_imag, z_imag))

# Note that the order of the arguments here is opposite the direction when
# we retrieve them into memory.
avxval = AVX.make_float8(values[7],
values[6],
values[5],
values[4],
values[3],
values[2],
values[1],
values[0])
# mask lanes whose |z|^2 is still below the escape limit
mask = AVX.less_than(z_squared, limit)

avxval = AVX.sqrt(avxval)
# stop iterating once every lane has escaped
if (AVX.signs(mask) == 0): break

# mask will be true where 2.0 < avxval
mask = AVX.less_than(AVX.float_to_float8(2.0), avxval)
# apply mask on array of eight ones
eight_ones_masked = AVX.bitwise_and(mask, eight_ones)

# invert mask and select off values, so should be 2.0 >= avxval
avxval = AVX.bitwise_andnot(mask, avxval)
# update counts
counts = AVX.add(counts, AVX.bitwise_and(mask, eight_ones_masked))

AVX.to_mem(avxval, &(out_vals[0]))
# z = z*z + c: compute the new real and imaginary parts
z_real_temp = AVX.sub(AVX.mul(z_real, z_real), AVX.mul(z_imag, z_imag))
z_imag_temp = AVX.add(AVX.mul(z_real, z_imag), AVX.mul(z_real, z_imag))
z_real = AVX.add(z_real_temp, c_real)
z_imag = AVX.add(z_imag_temp, c_imag)

return np.array(out_view)
counts_to_output(counts, out_counts, i, j)
5 changes: 5 additions & 0 deletions HW2/P4/P4.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1 thread: 6.60881018639 seconds for 10 filter passes.
2 threads: 3.62633299828 seconds for 10 filter passes.
4 threads: 3.69899082184 seconds for 10 filter passes.

I ran this code on a dual-core machine without hyperthreading support. Using 2 threads therefore resulted in an almost 2x speedup, while using 4 threads did not bring any additional performance. In my code I use the thread number as the offset and the number of threads as the step size, so each thread computes every nth line. Once all threads are done computing, the next iteration can begin; I use thread.join() to wait for each thread to finish.
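
A minimal sketch of that scheduling scheme with Python threads (the helpers filter_rows and run_pass are hypothetical; only the offset/step row decomposition and the join barrier correspond to the description above):

import threading
import numpy as np

def filter_rows(src, dst, offset, step):
    # each thread handles rows offset, offset + step, offset + 2*step, ...
    for row in range(offset, src.shape[0], step):
        dst[row] = src[row]                      # stand-in for the real per-row filter update

def run_pass(src, dst, num_threads):
    threads = [threading.Thread(target=filter_rows, args=(src, dst, k, num_threads))
               for k in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()                                 # wait for every row before the next pass

src = np.random.rand(480, 640).astype(np.float32)
dst = np.empty_like(src)
for _ in range(10):                              # 10 filter passes, as in the timings above
    run_pass(src, dst, num_threads=2)
    src, dst = dst, src                          # output of one pass feeds the next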