char* sequential_game_of_life (char* outboard, char* inboard, const int nrows, const int ncols, const int gens_max) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ const int LDA = nrows; int curgen, i, j; printf("%d\n", 2); for (curgen = 0; curgen < gens_max; curgen++) { /* HINT: you'll be parallelizing these loop(s) by doing a geometric decomposition of the output */ for (j = 0; j < ncols; j++) { for (i = 0; i < nrows; i++) { const int inorth = mod (i-1, nrows); const int isouth = mod (i+1, nrows); const int jwest = mod (j-1, ncols); const int jeast = mod (j+1, ncols); // printf("jwest-%d, jeast-%d, inorth-%d, isouth-%d\n", jwest,jeast,inorth,isouth); const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, inorth, jeast) + BOARD (inboard, i, jwest) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jwest) + BOARD (inboard, isouth, j) + BOARD (inboard, isouth, jeast); // printf("%d\n",neighbor_count); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); } } SWAP_BOARDS( outboard, inboard ); } printf("%d\n", 3); /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }
char* sequential_game_of_life (char* outboard, char* inboard, const int nrows, const int ncols, const int gens_max) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ // parallelization happens here pthread_t tid[NUM_THREADS]; t_args_t args[NUM_THREADS*2]; int curgen, i; // set up static args for threads 8x1 /*(for (i = 0; i < NUM_THREADS; i++) { args[i].nrows = nrows; args[i].ncols = ncols; args[i].rStart = (nrows/(NUM_THREADS*2)) * (i); args[i].rEnd = (nrows/(NUM_THREADS*2)) * (i+1); args[i].cStart = 0; args[i].cEnd = ncols; } for (i = 0; i < NUM_THREADS; i++) { args[NUM_THREADS+i].nrows = nrows; args[NUM_THREADS+i].ncols = ncols; args[NUM_THREADS+i].rStart = (nrows/(NUM_THREADS*2)) * (NUM_THREADS+i); args[NUM_THREADS+i].rEnd = (nrows/(NUM_THREADS*2)) * (NUM_THREADS+i+1); args[NUM_THREADS+i].cStart = 0; args[NUM_THREADS+i].cEnd = ncols; }*/ // 4x2 for (i = 0; i < NUM_THREADS; i++) { args[i].nrows = nrows; args[i].ncols = ncols; args[i].rStart = (nrows/(NUM_THREADS)) * (i%2); args[i].rEnd = (nrows/(NUM_THREADS)) * ((i%2)+1); args[i].cStart = (ncols/2) * (i/2); args[i].cEnd = (ncols/2) * ((i/2)+1); } for (i = 0; i < NUM_THREADS; i++) { args[NUM_THREADS+i].nrows = nrows; args[NUM_THREADS+i].ncols = ncols; args[NUM_THREADS+i].rStart = (nrows/(NUM_THREADS)) * (NUM_THREADS/2+(i%2)); args[NUM_THREADS+i].rEnd = (nrows/(NUM_THREADS)) * (NUM_THREADS/2+(i%2)+1); args[NUM_THREADS+i].cStart = (ncols/2) * (i/2); args[NUM_THREADS+i].cEnd = (ncols/2) * ((i/2)+1); } for (curgen = 0; curgen < gens_max; curgen++) { /* HINT: you'll be parallelizing these loop(s) by doing a geometric decomposition of the output */ for (i = 0; i < NUM_THREADS; i++) { args[i].outboard = outboard; args[i].inboard = inboard; pthread_create(&tid[i], NULL, thread, &args[i]); } for (i = 0; i < NUM_THREADS; i++) { pthread_join(tid[i], NULL); } for (i = 0; i < NUM_THREADS; i++) { args[NUM_THREADS+i].outboard = outboard; args[NUM_THREADS+i].inboard = inboard; pthread_create(&tid[i], NULL, thread, &args[NUM_THREADS+i]); } for (i = 0; i < NUM_THREADS; i++) { pthread_join(tid[i], NULL); } SWAP_BOARDS( outboard, inboard ); } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }
/** * Parallelized implementation of the game of life */ void* loop_parellize(void* arg){ structArgs *a; a = (structArgs*) arg; int nrows = a->nrows; char* outboard = a->outboard; char* inboard = a->inboard; int threadNum = a->threadNum; int ncols = a->ncols; int gens_max = a->gens_max; int initial_i = threadNum*(nrows/NUM_THREADS); int maximum_i = initial_i + (nrows/NUM_THREADS); int i,j; const int LDA = nrows; int vari = nrows/NUM_THREADS; int varj = ncols/2; int j2,i2,curgen; int jself, jnw, jn, jne, jw, je, jsw, js, jse; int iself, inw, in, ine, iw, ie, isw, is, ise; for (curgen = 0; curgen < gens_max; curgen++) { // Optimization: loop switching j and i loops for (j = 0; j < ncols; j+=varj) { for (i = initial_i; i < maximum_i; i+=vari) { // Optimization: Code Motion, Improved formula for inorth and isouth const int inorth = (i==0) ? nrows-1 : i-1; const int isouth = (i==nrows-1) ? 0 : i+1; // Optimization: Tiling for(j2=j;j2<j+varj;j2++){ // Optimization: Improved formula for jwest and jeast const int jwest = (j2 == 0)? ncols-1: j2-1; const int jeast = (j2 == ncols-1)? 0 : j2+1; if(j2 == j){ // Optimization: Loop iteration memory sharing inw = jnw = BOARD (inboard, inorth, jwest); in = jn = BOARD (inboard, inorth, j2); ine = jne = BOARD (inboard, inorth, jeast); iw = jw = BOARD (inboard, i, jwest); iself = jself = BOARD (inboard, i, j2); ie = je = BOARD (inboard, i, jeast); isw = jsw = BOARD (inboard, isouth, jwest); is = js = BOARD (inboard, isouth, j2); ise = jse = BOARD (inboard, isouth, jeast); } else{ //Optimization: Loop iteration memory sharing inw = jnw = jn; in = jn = jne; ine = jne = BOARD (inboard, inorth, jeast); iw = jw = jself; iself = jself = je; ie = je = BOARD (inboard, i, jeast); isw = jsw = js; is = js = jse; ise = jse = BOARD (inboard, isouth, jeast); } for(i2=i; i2<i+vari;i2++){ // printf("jwest-%d, jeast-%d, inorth-%d, isouth-%d, ThreadNum-%d\n", jwest,jeast,inorth,isouth,threadNum); if(i2>i){ //Optimization: Loop iteration memory sharing const int isouth2 = (i2==nrows-1) ? 0 : i2+1; inw = iw; in = iself; ine = ie; iw = isw; iself = is; ie = ise; isw = BOARD (inboard, isouth2, jwest); is = BOARD (inboard, isouth2, j2); ise = BOARD (inboard, isouth2, jeast); } const char neighbor_count = inw + in + ine + iw + ie + isw + is + ise; // printf("%d\n", neighbor_count); BOARD(outboard, i2, j2) = alivep (neighbor_count, iself); } } } } // Optimizaton: pthread barrier pthread_barrier_wait(a->barrp); SWAP_BOARDS( outboard, inboard ); } pthread_exit(0); }
char* game_of_life (char* outboard, char* inboard, const int nrows, const int ncols, const int gens_max) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ const int LDA = nrows; int curgen; for (curgen = 0; curgen < gens_max; curgen++) { /* HINT: you'll be parallelizing these loop(s) by doing a geometric decomposition of the output */ /** * Pragma directive invoking Open MP parallelization for the two nester for loops * Ensured that i and j declarations happen within the scope of the open MP * parallelization so that each thread have their own dedicated i and j variables * **/ #pragma omp parallel num_threads(NUM_THREADS) { int i, j; //Need these inside omp pragme so that they are not shared between threads int thread_num = omp_get_thread_num(); //Gets the current threads num identifier //Split the outer for loop equally between all threads in NUM_THREADS for (i = thread_num*nrows/NUM_THREADS ; i < (thread_num+1)*nrows/NUM_THREADS; i++) { const int inorth = mod (i-1, nrows); //LCIM - mod only uses 'i' value const int isouth = mod (i+1, nrows); //Declare all eight neighbours and current cell. //Compute, north, north-east, current, east, south, and south-west char nw; char n = BOARD (inboard, inorth, mod (-1, ncols)); char ne = BOARD (inboard, inorth, 0); char w; char c = BOARD (inboard, i, mod (-1, ncols)); char e = BOARD (inboard, i, 0); char sw; char s = BOARD (inboard, isouth, mod (-1, ncols)); char se = BOARD (inboard, isouth, 0); for (j = 0; j < ncols; j++) { const int jwest = mod (j-1, ncols); const int jeast = mod (j+1, ncols); //Shift the neighbour values to the left //This enables us to save computation in each stride of j //Only need to compute three new values each stride: // north-east // east // south-east nw = n; n = ne; ne = BOARD (inboard, inorth, jeast); w = c; c = e; e = BOARD (inboard, i, jeast); sw = s; s = se; se = BOARD (inboard, isouth, jeast); const char neighbor_count = nw + n + ne + w + e + sw + s + se; BOARD(outboard, i, j) = alivep (neighbor_count, c); } } } SWAP_BOARDS( outboard, inboard ); } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }
void* parallel_game_of_life (void* arg) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ int inorth; int isouth; int jwest; int jeast; thread_struct *thread = (thread_struct *)arg; char* outboard = thread->outboard; char* inboard = thread->inboard; const int nrows = thread->nrows; const int ncols = thread->ncols; const int gens_max = thread->gens_max; pthread_barrier_t *bar = thread->bar; const int LDA = nrows; /** * dividing up the number of rows between the 4 threads */ int from = (thread->thread_num * nrows) / NUMBER_OF_THREADS; int to_row = ((thread->thread_num + 1) * nrows) / NUMBER_OF_THREADS; int curgen, i, j; /* HINT: you'll be parallelizing these loop(s) by doing a geometric decomposition of the output */ for (curgen = 0; curgen < gens_max; curgen++) { for (i = from; i < to_row; i++) { //Only use mod to calculate inorth and isouth if we're at the boundary if (i == 0 || i == nrows - 1) { inorth = mod (i-1, nrows); isouth = mod (i+1, nrows); } else { inorth = i-1; isouth = i+1; } for (j = 0; j < ncols; j++) { //Only use mod to calculate jwest and jeast if we're at the boundary if (j == 0 || j == ncols - 1) { jwest = mod (j-1, ncols); jeast = mod (j+1, ncols); } else { jwest = j-1; jeast = j+1; } const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, inorth, jeast) + BOARD (inboard, i, jwest) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jwest) + BOARD (inboard, isouth, j) + BOARD (inboard, isouth, jeast); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); } } pthread_barrier_wait(bar); SWAP_BOARDS( outboard, inboard ); } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return NULL; }
char* sequential_game_of_life (char* outboard_, char* inboard_, const int nrows_, const int ncols_, const int gens_max) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ nrows = nrows_; ncols = ncols_; outboard = outboard_; inboard = inboard_; LDA = nrows; slice = (nrows / NUM_THREADS); mask = nrows - 1; pthread_t *thread = (pthread_t*)malloc(NUM_THREADS * sizeof(pthread_t)); if (nrows_ <= 32 && ncols_ <= 32) { int curgen, i, j; for (curgen = 0; curgen < gens_max; curgen++) { for (i = 0; i < nrows; i++) { for (j = 0; j < ncols; j++) { const int inorth = mod (i-1, nrows); const int isouth = mod (i+1, nrows); const int jwest = mod (j-1, ncols); const int jeast = mod (j+1, ncols); const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, i, jwest) + BOARD (inboard, isouth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jeast) + BOARD (inboard, inorth, jeast) + BOARD (inboard, isouth, j); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); } } SWAP_BOARDS( outboard, inboard ); } } else { int curgen, i; for (curgen = 0; curgen < gens_max; curgen++) { for (i = 0; i < NUM_THREADS; i++) pthread_create (&thread[i], NULL, parallel_run, (void*)i); for (i = 0; i < NUM_THREADS; i++) pthread_join (thread[i], NULL); SWAP_BOARDS( outboard, inboard ); } } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }