void *thread(void *arg) { int i, j; char* outboard = ((t_args_t *)arg)->outboard; char* inboard = ((t_args_t *)arg)->inboard; const int nrows = ((t_args_t *)arg)->nrows; const int ncols = ((t_args_t *)arg)->ncols; const int rStart = ((t_args_t *)arg)->rStart; const int rEnd = ((t_args_t *)arg)->rEnd; const int cStart = ((t_args_t *)arg)->cStart; const int cEnd = ((t_args_t *)arg)->cEnd; const int LDA = nrows; for (i = rStart; i < rEnd; i++) { for (j = cStart; j < cEnd; j++) { const int inorth = mod (i-1, nrows); const int isouth = mod (i+1, nrows); const int jwest = mod (j-1, ncols); const int jeast = mod (j+1, ncols); const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, inorth, jeast) + BOARD (inboard, i, jwest) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jwest) + BOARD (inboard, isouth, j) + BOARD (inboard, isouth, jeast); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); } } pthread_exit(NULL); }
char* sequential_game_of_life (char* outboard, char* inboard, const int nrows, const int ncols, const int gens_max) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ const int LDA = nrows; int curgen, i, j; printf("%d\n", 2); for (curgen = 0; curgen < gens_max; curgen++) { /* HINT: you'll be parallelizing these loop(s) by doing a geometric decomposition of the output */ for (j = 0; j < ncols; j++) { for (i = 0; i < nrows; i++) { const int inorth = mod (i-1, nrows); const int isouth = mod (i+1, nrows); const int jwest = mod (j-1, ncols); const int jeast = mod (j+1, ncols); // printf("jwest-%d, jeast-%d, inorth-%d, isouth-%d\n", jwest,jeast,inorth,isouth); const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, inorth, jeast) + BOARD (inboard, i, jwest) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jwest) + BOARD (inboard, isouth, j) + BOARD (inboard, isouth, jeast); // printf("%d\n",neighbor_count); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); } } SWAP_BOARDS( outboard, inboard ); } printf("%d\n", 3); /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }
/*
 * Computes the next state of the single cell (i, j).
 *
 * Used by the original version of the code.  Instead of calling the modulus
 * helper, the neighbour indices are wrapped around the board edges with
 * plain comparisons (later versions dropped this approach).
 *
 * The eight neighbours are read from inboard and the value returned by
 * alivep() is stored at (i, j) in outboard.
 */
static inline void update(int i, int j, char* outboard, char* inboard, const int nrows, const int ncols){
    /* Not referenced directly below; presumably picked up by name inside
     * the BOARD macro -- do not rename. */
    const int LDA = nrows;

    /* Step one cell in each direction, wrapping at the edges. */
    const int inorth = (i == 0)         ? nrows - 1 : i - 1;
    const int isouth = (i == nrows - 1) ? 0         : i + 1;
    const int jwest  = (j == 0)         ? ncols - 1 : j - 1;
    const int jeast  = (j == ncols - 1) ? 0         : j + 1;

    const char neighbor_count =
        BOARD (inboard, inorth, jwest) +
        BOARD (inboard, inorth, j) +
        BOARD (inboard, inorth, jeast) +
        BOARD (inboard, i, jwest) +
        BOARD (inboard, i, jeast) +
        BOARD (inboard, isouth, jwest) +
        BOARD (inboard, isouth, j) +
        BOARD (inboard, isouth, jeast);

    BOARD (outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j));
}
void* parallel_run(void* args) { int n = (intptr_t) args; int rows_from = n * slice; int rows_to = rows_from + slice; int i, j, ii, jj, inorth, isouth, jwest, jeast; for (i = rows_from; i < rows_to; i ++) { for (j = 0; j < ncols; j ++) { //for (ii = i; ii < i + BLOCK_SIZE; ii++) { inorth = (i-1) & mask; isouth = (i+1) & mask; // for (jj = j; jj < j + BLOCK_SIZE; jj++) { jwest = (j-1) & mask; jeast = (j+1) & mask; const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, i, jwest) + BOARD (inboard, isouth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jeast) + BOARD (inboard, inorth, jeast) + BOARD (inboard, isouth, j); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); // } // } } } }
/** * Parallelized implementation of the game of life */ void* loop_parellize(void* arg){ structArgs *a; a = (structArgs*) arg; int nrows = a->nrows; char* outboard = a->outboard; char* inboard = a->inboard; int threadNum = a->threadNum; int ncols = a->ncols; int gens_max = a->gens_max; int initial_i = threadNum*(nrows/NUM_THREADS); int maximum_i = initial_i + (nrows/NUM_THREADS); int i,j; const int LDA = nrows; int vari = nrows/NUM_THREADS; int varj = ncols/2; int j2,i2,curgen; int jself, jnw, jn, jne, jw, je, jsw, js, jse; int iself, inw, in, ine, iw, ie, isw, is, ise; for (curgen = 0; curgen < gens_max; curgen++) { // Optimization: loop switching j and i loops for (j = 0; j < ncols; j+=varj) { for (i = initial_i; i < maximum_i; i+=vari) { // Optimization: Code Motion, Improved formula for inorth and isouth const int inorth = (i==0) ? nrows-1 : i-1; const int isouth = (i==nrows-1) ? 0 : i+1; // Optimization: Tiling for(j2=j;j2<j+varj;j2++){ // Optimization: Improved formula for jwest and jeast const int jwest = (j2 == 0)? ncols-1: j2-1; const int jeast = (j2 == ncols-1)? 
0 : j2+1; if(j2 == j){ // Optimization: Loop iteration memory sharing inw = jnw = BOARD (inboard, inorth, jwest); in = jn = BOARD (inboard, inorth, j2); ine = jne = BOARD (inboard, inorth, jeast); iw = jw = BOARD (inboard, i, jwest); iself = jself = BOARD (inboard, i, j2); ie = je = BOARD (inboard, i, jeast); isw = jsw = BOARD (inboard, isouth, jwest); is = js = BOARD (inboard, isouth, j2); ise = jse = BOARD (inboard, isouth, jeast); } else{ //Optimization: Loop iteration memory sharing inw = jnw = jn; in = jn = jne; ine = jne = BOARD (inboard, inorth, jeast); iw = jw = jself; iself = jself = je; ie = je = BOARD (inboard, i, jeast); isw = jsw = js; is = js = jse; ise = jse = BOARD (inboard, isouth, jeast); } for(i2=i; i2<i+vari;i2++){ // printf("jwest-%d, jeast-%d, inorth-%d, isouth-%d, ThreadNum-%d\n", jwest,jeast,inorth,isouth,threadNum); if(i2>i){ //Optimization: Loop iteration memory sharing const int isouth2 = (i2==nrows-1) ? 0 : i2+1; inw = iw; in = iself; ine = ie; iw = isw; iself = is; ie = ise; isw = BOARD (inboard, isouth2, jwest); is = BOARD (inboard, isouth2, j2); ise = BOARD (inboard, isouth2, jeast); } const char neighbor_count = inw + in + ine + iw + ie + isw + is + ise; // printf("%d\n", neighbor_count); BOARD(outboard, i2, j2) = alivep (neighbor_count, iself); } } } } // Optimizaton: pthread barrier pthread_barrier_wait(a->barrp); SWAP_BOARDS( outboard, inboard ); } pthread_exit(0); }
char* sequential_game_of_life_parallel (char* outboard, char* inboard, const int nrows, const int ncols, const int gens_max, const int sector, int *status, pthread_mutex_t *mutex, pthread_cond_t *cv) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ int curgen, i, j; int row_start, col_start; int row_end, col_end; //Splitting what quadrant we work on. if(sector == 0 || sector == 2){ row_start = 1; row_end = nrows/2; } else{ row_start = nrows/2; row_end = nrows - 1; } if(sector == 0 || sector == 1){ col_start = 1; col_end = ncols/2; } else{ col_start = ncols/2; col_end = ncols - 1; } const int LDA = nrows; char mem_access[3]; char cent; for (curgen = 0; curgen < gens_max; curgen++) { char neighbor_count; //The overlapping sections if (sector == 0){ //j == 0 //i == 0 COUNT_AND_BOARD(inboard, outboard, neighbor_count, 0, 0, nrows - 1, 1, ncols - 1, 1); //j == 0 //i == 1 -> i == nrows/2 - 1 I_CODE_WITH_J(0, ncols - 1, 1); //j == 1 -> j == ncols/2 - 1 //i == 0 //J_CODE_WITH_I(0, nrows - 1, 1); for (j = col_start; j < col_end; j++) { COUNT_AND_BOARD(inboard, outboard, neighbor_count, 0, j, nrows - 1, 1, j - 1, j + 1); } } else if(sector == 1){ //j == 0 //i == nrows - 1 COUNT_AND_BOARD(inboard, outboard, neighbor_count, nrows - 1, 0, nrows - 2, 0, ncols - 1, 1); //j == 0 //i == nrows/2 -> i == nrows - 2 I_CODE_WITH_J(0, ncols - 1, 1); //j == 1 -> j == ncols/2 - 1 //i == nrows - 1 //J_CODE_WITH_I(nrows - 1, nrows - 2, 0); for (j = col_start; j < col_end; j++) { COUNT_AND_BOARD(inboard, outboard, neighbor_count, nrows - 1, j, nrows - 2, 0, j - 1, j + 1); } } else if(sector == 2){ //j == ncols - 1 //i == 0 COUNT_AND_BOARD(inboard, outboard, neighbor_count, 0, ncols - 1, nrows - 1, 1, ncols - 2, 0); //j == ncols - 1 //i == 1 -> i == nrows/2 - 1 I_CODE_WITH_J(ncols - 1, ncols - 2, 0); //j == ncols/2 -> j == ncols - 2 //i == 0 //J_CODE_WITH_I(0, nrows - 1, 1); for (j = col_start; j < col_end; j++) { COUNT_AND_BOARD(inboard, outboard, neighbor_count, 0, 
j, nrows - 1, 1, j - 1, j + 1); } } else{ //j == ncols - 1 //i == nrows - 1 COUNT_AND_BOARD(inboard, outboard, neighbor_count, nrows - 1, ncols - 1, nrows - 2, 0, ncols - 2, 0); //j == ncols - 1 //i == nrows/2 -> i == nrows - 2 I_CODE_WITH_J(ncols - 1, ncols - 2, 0); //j == ncols/2 -> j == ncols - 2 //i == nrows - 1 //J_CODE_WITH_I(nrows - 1, nrows - 2, 0); for (j = col_start; j < col_end; j++) { COUNT_AND_BOARD(inboard, outboard, neighbor_count, nrows - 1, j, nrows - 2, 0, j - 1, j + 1); } } //Main code part, no if/else branching //Unroleld once in the i dimension, as well as a block on j of size 4. int jj; for (jj = col_start; jj < col_end; jj+= J_BLOCK_SIZE) { for (j = jj; j < min(jj + J_BLOCK_SIZE, col_end); j++) { //Initializing sum mem_access[0] = 0; mem_access[1] = BOARD (inboard, row_start-1, j-1) + BOARD (inboard, row_start-1, j) + BOARD (inboard, row_start-1, j+1); mem_access[2] = BOARD (inboard, row_start, j-1) + BOARD (inboard, row_start, j) + BOARD (inboard, row_start, j+1); cent = 0; neighbor_count = mem_access[1] + mem_access[2]; for(i = row_start; i < row_end - 1; i+=2) { //1 neighbor_count += cent; cent = BOARD (inboard, i, j); neighbor_count = neighbor_count - mem_access[0] - cent; mem_access[0] = mem_access[1]; mem_access[1] = mem_access[2]; mem_access[2] = BOARD (inboard, i+1, j-1) + BOARD (inboard, i+1, j) + BOARD (inboard, i+1, j+1); neighbor_count += mem_access[2]; BOARD(outboard, i, j) = alivep (neighbor_count, cent); //2 neighbor_count += cent; cent = BOARD (inboard, i+1, j); neighbor_count = neighbor_count - mem_access[0] - cent; mem_access[0] = mem_access[1]; mem_access[1] = mem_access[2]; mem_access[2] = BOARD (inboard, i+2, j-1) + BOARD (inboard, i+2, j) + BOARD (inboard, i+2, j+1); neighbor_count += mem_access[2]; BOARD(outboard, i+1, j) = alivep (neighbor_count, cent); //COUNT_AND_BOARD_IJ(inboard, outboard, neighbor_count, i, j); } neighbor_count += cent; cent = BOARD (inboard, i, j); neighbor_count = neighbor_count - mem_access[0] - 
cent; mem_access[0] = mem_access[1]; mem_access[1] = mem_access[2]; mem_access[2] = BOARD (inboard, i+1, j-1) + BOARD (inboard, i+1, j) + BOARD (inboard, i+1, j+1); neighbor_count += mem_access[2]; BOARD(outboard, i, j) = alivep (neighbor_count, cent); } } //SWAP_BOARDS( outboard, inboard ); //I don't like that weird do while wrapper. char *temp = outboard; outboard = inboard; inboard = temp; pthread_mutex_lock(mutex); *status = *status | (1 << sector); if(*status == 0b1111){ *status = 0; pthread_cond_broadcast(cv); } else{ pthread_cond_wait(cv, mutex); } //Everyone finished working on their sector, can start the next sector pthread_mutex_unlock(mutex); } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }
char* game_of_life (char* outboard, char* inboard, const int nrows, const int ncols, const int gens_max) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ const int LDA = nrows; int curgen; for (curgen = 0; curgen < gens_max; curgen++) { /* HINT: you'll be parallelizing these loop(s) by doing a geometric decomposition of the output */ /** * Pragma directive invoking Open MP parallelization for the two nester for loops * Ensured that i and j declarations happen within the scope of the open MP * parallelization so that each thread have their own dedicated i and j variables * **/ #pragma omp parallel num_threads(NUM_THREADS) { int i, j; //Need these inside omp pragme so that they are not shared between threads int thread_num = omp_get_thread_num(); //Gets the current threads num identifier //Split the outer for loop equally between all threads in NUM_THREADS for (i = thread_num*nrows/NUM_THREADS ; i < (thread_num+1)*nrows/NUM_THREADS; i++) { const int inorth = mod (i-1, nrows); //LCIM - mod only uses 'i' value const int isouth = mod (i+1, nrows); //Declare all eight neighbours and current cell. 
//Compute, north, north-east, current, east, south, and south-west char nw; char n = BOARD (inboard, inorth, mod (-1, ncols)); char ne = BOARD (inboard, inorth, 0); char w; char c = BOARD (inboard, i, mod (-1, ncols)); char e = BOARD (inboard, i, 0); char sw; char s = BOARD (inboard, isouth, mod (-1, ncols)); char se = BOARD (inboard, isouth, 0); for (j = 0; j < ncols; j++) { const int jwest = mod (j-1, ncols); const int jeast = mod (j+1, ncols); //Shift the neighbour values to the left //This enables us to save computation in each stride of j //Only need to compute three new values each stride: // north-east // east // south-east nw = n; n = ne; ne = BOARD (inboard, inorth, jeast); w = c; c = e; e = BOARD (inboard, i, jeast); sw = s; s = se; se = BOARD (inboard, isouth, jeast); const char neighbor_count = nw + n + ne + w + e + sw + s + se; BOARD(outboard, i, j) = alivep (neighbor_count, c); } } } SWAP_BOARDS( outboard, inboard ); } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }
void* parallel_game_of_life (void* arg) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ int inorth; int isouth; int jwest; int jeast; thread_struct *thread = (thread_struct *)arg; char* outboard = thread->outboard; char* inboard = thread->inboard; const int nrows = thread->nrows; const int ncols = thread->ncols; const int gens_max = thread->gens_max; pthread_barrier_t *bar = thread->bar; const int LDA = nrows; /** * dividing up the number of rows between the 4 threads */ int from = (thread->thread_num * nrows) / NUMBER_OF_THREADS; int to_row = ((thread->thread_num + 1) * nrows) / NUMBER_OF_THREADS; int curgen, i, j; /* HINT: you'll be parallelizing these loop(s) by doing a geometric decomposition of the output */ for (curgen = 0; curgen < gens_max; curgen++) { for (i = from; i < to_row; i++) { //Only use mod to calculate inorth and isouth if we're at the boundary if (i == 0 || i == nrows - 1) { inorth = mod (i-1, nrows); isouth = mod (i+1, nrows); } else { inorth = i-1; isouth = i+1; } for (j = 0; j < ncols; j++) { //Only use mod to calculate jwest and jeast if we're at the boundary if (j == 0 || j == ncols - 1) { jwest = mod (j-1, ncols); jeast = mod (j+1, ncols); } else { jwest = j-1; jeast = j+1; } const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, inorth, jeast) + BOARD (inboard, i, jwest) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jwest) + BOARD (inboard, isouth, j) + BOARD (inboard, isouth, jeast); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); } } pthread_barrier_wait(bar); SWAP_BOARDS( outboard, inboard ); } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return NULL; }
char* sequential_game_of_life (char* outboard_, char* inboard_, const int nrows_, const int ncols_, const int gens_max) { /* HINT: in the parallel decomposition, LDA may not be equal to nrows! */ nrows = nrows_; ncols = ncols_; outboard = outboard_; inboard = inboard_; LDA = nrows; slice = (nrows / NUM_THREADS); mask = nrows - 1; pthread_t *thread = (pthread_t*)malloc(NUM_THREADS * sizeof(pthread_t)); if (nrows_ <= 32 && ncols_ <= 32) { int curgen, i, j; for (curgen = 0; curgen < gens_max; curgen++) { for (i = 0; i < nrows; i++) { for (j = 0; j < ncols; j++) { const int inorth = mod (i-1, nrows); const int isouth = mod (i+1, nrows); const int jwest = mod (j-1, ncols); const int jeast = mod (j+1, ncols); const char neighbor_count = BOARD (inboard, inorth, jwest) + BOARD (inboard, i, jwest) + BOARD (inboard, isouth, jwest) + BOARD (inboard, inorth, j) + BOARD (inboard, i, jeast) + BOARD (inboard, isouth, jeast) + BOARD (inboard, inorth, jeast) + BOARD (inboard, isouth, j); BOARD(outboard, i, j) = alivep (neighbor_count, BOARD (inboard, i, j)); } } SWAP_BOARDS( outboard, inboard ); } } else { int curgen, i; for (curgen = 0; curgen < gens_max; curgen++) { for (i = 0; i < NUM_THREADS; i++) pthread_create (&thread[i], NULL, parallel_run, (void*)i); for (i = 0; i < NUM_THREADS; i++) pthread_join (thread[i], NULL); SWAP_BOARDS( outboard, inboard ); } } /* * We return the output board, so that we know which one contains * the final result (because we've been swapping boards around). * Just be careful when you free() the two boards, so that you don't * free the same one twice!!! */ return inboard; }