void runloop(int loopid) { #pragma omp parallel default(none) shared(loopid) { int myid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); int ipt = (int) ceil((double)N/(double)nthreads); int lo = myid*ipt; int hi = (myid+1)*ipt; if (hi > N) hi = N; // printf("thread %d has lo = %d and hi = %d \n", myid, lo, hi); int total_iters = hi-lo; int remaining_iters = hi-lo; int dist = ceil(remaining_iters/nthreads); int counter=0; while(remaining_iters>0) { dist = floor( remaining_iters / nthreads ) + 1; hi = lo + dist; // printf("thread : %d lo = %d hi = %d \n", myid, lo, hi); switch (loopid) { case 1: loop1chunk(lo,hi); break; case 2: loop2chunk(lo,hi); break; } counter += hi-lo; remaining_iters = total_iters - counter; lo = hi; } // printf("Final counter on thread %d = %d \n", myid, counter); } }
void runloop(int loopid) { #pragma omp parallel default(none) shared(loopid, remaining_iters, hi, lo, remaining_iters_lock) { int chunk, start_iter, end_iter, remaining_iters_tmp; int next_thread_id; int myid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); double K = (double) 1/nthreads;//k=1/p int ipt = (int) ceil((double)N/(double)nthreads); lo[myid] = myid*ipt; hi[myid] = (myid+1)*ipt; if (hi[myid] > N) hi[myid] = N; remaining_iters_tmp = hi[myid]-lo[myid]; remaining_iters[myid] = remaining_iters_tmp; while(remaining_iters_tmp > 0) { get_chunks(myid, K, &start_iter, &chunk); /* Set DEBUG flag to TRUE if you want to see the flow details*/ if(DEBUG==TRUE) print_run_details("Own", loopid, myid, myid, start_iter, chunk); switch(loopid){ case 1: loop1chunk(start_iter, start_iter+chunk); case 2: loop2chunk(start_iter, start_iter+chunk); } remaining_iters_tmp = read_remaining_iters(myid); }//end while loop 1 get_most_loaded_thread_details(nthreads, &next_thread_id, &remaining_iters_tmp); while(remaining_iters_tmp >0){ get_chunks(next_thread_id, K, &start_iter, &chunk); /* Set DEBUG flag to TRUE if you want to see the flow details*/ if(DEBUG==TRUE) print_run_details("Affinity", loopid, myid, next_thread_id, start_iter, chunk); switch(loopid){ case 1: loop1chunk(start_iter, start_iter+chunk); case 2: loop2chunk(start_iter, start_iter+chunk); } get_most_loaded_thread_details(nthreads, &next_thread_id, &remaining_iters_tmp); }//end while loop 2 } }
/** * The idea is to implement a general work-stealing algorithm using critical sections (alternatives are discussed in the report)/ * However, rather than computing iterations owned by the current thread, we're going to steal from ourself. Once own own iterations have * been completed, we will start stealing from other threads. The preference will be given to threads with higher IDs due to the way the * work is distributed (this, again, is explained in the report). */ void runloop(int loopid) { int thread_count = omp_get_max_threads(); // the number of threads in the system. // we don't know how many exist yet, so use this. alternatively, we // could have used getenv() from <stdlib.h> to get the env variable, but // this seems cleaner. it should always work within our setup as well. int n_over_p = (int) ceil((double) N / (double) thread_count); // what it says on the tin float one_over_p = 1.0 / thread_count; // one over p int lower_bounds[thread_count]; // stores the lower bound of the array not already computed. int upper_bounds[thread_count]; // stores the upper bound of the array not already computed. // upper_bounds[i] - lower_bounds[i] = remaining iterations #pragma omp parallel default(none) \ shared(thread_count, loopid, lower_bounds, upper_bounds, n_over_p, one_over_p) { int thread_id = omp_get_thread_num(), thread_low = thread_id * n_over_p, thread_high = ((thread_id + 1) * n_over_p) > N ? N : (thread_id + 1) * n_over_p; // in case n mod p != 0 lower_bounds[thread_id] = thread_low; upper_bounds[thread_id] = thread_high; // We need to ensure that the last iteration does not compute twice. Although this could be done with an if statement below the // switch, I feel that it should be achievable in a more succict method. Thus, in the first iteration we will perform no work // which allows findThreadToSteaFrom() to perform it's computation and update current_low and current_high. Hence, the second // iteration is the first one that will perform any work. int current_low = 0, current_high = 0, stealing_from = 0; while(stealing_from != -1) { switch(loopid) { case 1: loop1chunk(current_low, current_high); break; case 2: loop2chunk(current_low, current_high); break; } // Find the next current_low and current_high. Notice the use of pointers to these values as replacements for C#/C++-style out params. // This would go nicely in the while loop condition, but unfortunately we need the #pragma block. #pragma omp critical { stealing_from = findThreadToStealFrom(lower_bounds, upper_bounds, thread_count, thread_id, one_over_p, ¤t_low, ¤t_high); } } } }
void runloop(int loopid) { int global_work_remaining[omp_get_max_threads()]; omp_lock_t writelock; omp_init_lock(&writelock); #pragma omp parallel default(none) shared(global_work_remaining, writelock, loopid, waiting_time, loop_time,a, b, c) { int i; int start_time, stop_time; int my_id = omp_get_thread_num(); int nthreads = omp_get_num_threads(); int ipt = (int) ceil((double)N/(double)nthreads); /* there should be as many chunks as there are threads * and they should have roughly identical ranges */ int chunk_id = my_id; int chunk_lo = chunk_id*ipt; int chunk_hi = (chunk_id+1)*ipt; if (chunk_hi > N) chunk_hi = N; int chunk_range = chunk_hi-chunk_lo; /* these are the variables that tell how much * work a thread is doing in a chunk */ int local_lo, local_hi, local_work; /* initialise the shared array*/ global_work_remaining[my_id] = chunk_range; #pragma omp barrier /* continue to do work unless there is no work left to do */ while(1) { start_time = omp_get_wtime(); omp_set_lock(&writelock); if(global_work_remaining[chunk_id] == 0) { int old_id = chunk_id; for(i=0; i<nthreads; i++) { if(global_work_remaining[chunk_id] < global_work_remaining[i]) { chunk_id = i; } } if(old_id == chunk_id) { omp_unset_lock(&writelock); break; } else { chunk_hi = (chunk_id+1)*ipt; if (chunk_hi > N) chunk_hi = N; chunk_range = global_work_remaining[chunk_id]; } } else { chunk_range = global_work_remaining[chunk_id]; } local_work = floor((double)chunk_range/(double)nthreads); if(local_work < 1) local_work = 1; global_work_remaining[chunk_id] -= local_work; omp_unset_lock(&writelock); local_lo = chunk_hi - chunk_range; local_hi = local_lo + local_work; waiting_time[my_id] += omp_get_wtime() - start_time; start_time = omp_get_wtime(); switch (loopid) { case 1: loop1chunk(local_lo,local_hi); break; case 2: loop2chunk(local_lo,local_hi); break; } loop_time[my_id] += omp_get_wtime() -start_time; } } }
void runloop(int loopid) { struct block* blocks; //Declaring the struct #pragma omp parallel default(none) shared(loopid, blocks) //start of parallel region { int myid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); #pragma omp single { blocks=(struct block*)malloc(sizeof(struct block)*nthreads); //initialising the struct } int ipt = (int) ceil((double)N/(double)nthreads); int lo = myid*ipt; int hi = (myid+1)*ipt; if (hi > N) hi = N; int r = hi - lo; int num_iters= (int)ceil((double)r/(double)nthreads); int most_work; int loc_most_work; int max=0; #pragma omp critical //members of the struct must be updated within critical regions to ensure synchronisation and avoid race condition { blocks[myid].high=hi; blocks[myid].remaining=r; printf("Thread %d has remaining %d and num iters is%d\n",myid, blocks[myid].remaining,num_iters); } //each thread does its own iterations in this while loop while(blocks[myid].remaining>0){ //critical region to update struct members #pragma omp critical { num_iters= (int)ceil((double)(blocks[myid].remaining)/(double)nthreads); lo=blocks[myid].high - blocks[myid].remaining; hi=lo + num_iters; blocks[myid].remaining = blocks[myid].remaining - num_iters; num_iters= (int)ceil((double)(blocks[myid].remaining)/(double)nthreads); } //printing working iterations printf("Thread %d iterating from %d to %d with %d remaining\n", myid, lo, hi, blocks[myid].remaining ); //run through the loop if(blocks[myid].remaining>=0){ switch (loopid) { case 1: loop1chunk(lo,hi); break; case 2: loop2chunk(lo,hi); break; } } } //do while loop for work stealing from most load thread by idle threads do { loc_most_work=-1; most_work=0; int remaining; //updating members and finding how much work the most loaded thread has, and which is most loaded //which also needs to be done inside a critical region #pragma omp critical { if(blocks[myid].remaining==0){ int i; for(i=0;i<nthreads;i++){ if (blocks[i].remaining>most_work){ most_work = blocks[i].remaining; loc_most_work=i; } } if(loc_most_work>=0){ if(most_work>=0){ num_iters= (int)ceil((double)(blocks[loc_most_work].remaining)/(double)nthreads); lo=blocks[loc_most_work].high - blocks[loc_most_work].remaining; hi=lo + num_iters; if (hi > N) hi = N; blocks[loc_most_work].remaining -= num_iters; } } } } //ensuring synchronisation if(myid>=0){ if(loc_most_work>=0){ switch (loopid) { case 1: loop1chunk(lo,hi); break; case 2: loop2chunk(lo,hi); break; } //printing the work steals printf("Thread %d stealing from thread %d iterating %d to %d with %d remaining\n",myid, loc_most_work, lo, hi, blocks[loc_most_work].remaining); } } }while(most_work>0); //iterations only done while other threads have work left to do } free(blocks); //freeing blocks so there are no memory leakages }