void _mp_copypriv_move_tls(void **blk_tp, int off, int size, int single_thread) {
  int lcpu;
  char *to;
  char *garbage = 0;

  if (single_thread != -1) { /* single thread */
    if (*blk_tp == 0)
      singadr = (char *)__kmpc_threadprivate(0, single_thread, garbage, (size_t)size);
    else
      singadr = *blk_tp;
    singlen = size;
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
  if (single_thread == -1) { /* other threads */
    lcpu = __kmpc_global_thread_num(0);
    if (*blk_tp == 0)
      to = (char *)__kmpc_threadprivate(0, lcpu, garbage, (size_t)size);
    else
      to = *blk_tp;
    memcpy(to, singadr, size);
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
}
/* C/C++: copy a private stack or other variable */
void _mp_copypriv(char *adr, long len, int thread) {
  if (thread == 0) {
    singadr = adr;
    singlen = len;
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
  if (thread)
    memcpy(adr, singadr, singlen);
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
}
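/* Illustrative sketch only (not part of this file): assuming the compiler
 * lowers a `copyprivate` clause on a `single` construct to _mp_copypriv,
 * with `thread` being 0 on the thread that executed the single region and
 * nonzero on all others, source like the following would broadcast x to
 * every thread in the team. compute_x() and the exact lowering shown are
 * assumptions for illustration. */
#if 0
void broadcast_example(void) {
  int x;
  #pragma omp parallel private(x)
  {
    #pragma omp single copyprivate(x)
    x = compute_x();
    /* per-thread lowering, roughly:
     *   _mp_copypriv((char *)&x, sizeof(x), i_executed_single ? 0 : 1);
     * where i_executed_single is a hypothetical flag the compiler tracks */
  }
}
#endif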
/* Copy multiple items from the master to the child threads.
 * Don't use: kept for backward compatibility. */
void _mp_copyin_move_multiple(int n_entries, void *data) {
  int i;
  const int tid = __kmpc_global_thread_num(NULL);
  struct pair_t {
    size_t size;
    void *data;
  };

  if (tid != 0) {
    for (i = 0; i < n_entries; ++i) {
      struct pair_t *item = (struct pair_t *)data + i;
      void *key = item->data;
      const size_t size = item->size;
      void *to = __kmpc_threadprivate_cached(NULL, tid, NULL, size, key);
      /* FIXME: Should this be 0 or the team master?
       * I think the gtid of the team master. */
      void *fr = __kmpc_threadprivate_cached(NULL, 0, NULL, size, key);
      if (to != fr)
        memcpy(to, fr, size);
    }
  }
  __kmpc_barrier(NULL, tid);
}
/* C++: copy data from the master's block to the other threads' blocks
 * using the assignment operator.
 * vector_size is 1 for non-arrays, n for array[n]. */
void _mp_copyin_move_cpp_new(void *blk_tp, int off, int class_size,
                             int vector_size, assign_func_ptr assign_op,
                             char *fr) {
  int lcpu;
  char *to;
  char *garbage = 0;
  int i;

  if (!fr)
    return;
  lcpu = __kmpc_global_thread_num(0);
  to = __kmpc_threadprivate_cached(0, lcpu, garbage,
                                   (size_t)(class_size * vector_size), blk_tp);
  for (i = 0; i < vector_size; i++) {
    if (to != fr)
      (*assign_op)(to, fr);
    to += class_size;
    fr += class_size;
  }
}
void _mp_copyin_move_cpp(void *blk_tp, int off, int class_size,
                         int vector_size, assign_func_ptr assign_op) {
  int lcpu;
  char *to, *fr;
  char *garbage = 0;
  int i;

  lcpu = __kmpc_global_thread_num(0);
  __kmpc_barrier(0, lcpu);
  if (lcpu != 0) {
    fr = __kmpc_threadprivate_cached(0, 0, garbage,
                                     (size_t)(class_size * vector_size), blk_tp);
    to = __kmpc_threadprivate_cached(0, lcpu, garbage,
                                     (size_t)(class_size * vector_size), blk_tp);
    for (i = 0; i < vector_size; i++) {
      if (to != fr)
        (*assign_op)(to, fr);
      to += class_size;
      fr += class_size;
    }
  }
  __kmpc_barrier(0, lcpu);
}
void _mp_copyin_move_al(void *blk_tp, int off, long size) {
  int lcpu;
  char *to, *fr;
  char *garbage = 0;

  lcpu = __kmpc_global_thread_num(0);
  if (lcpu != 0) {
    fr = __kmpc_threadprivate_cached(0, 0, garbage, (size_t)size, blk_tp);
    to = __kmpc_threadprivate_cached(0, lcpu, garbage, (size_t)size, blk_tp);
    if (to && to != fr) {
      memcpy(to, fr, size);
    }
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
}
void _mp_copyin_move_tls(void *blk_tp, int off, int size) {
  int lcpu;
  char *to, *fr;
  char *garbage = 0;

  lcpu = __kmpc_global_thread_num(0);
  if (lcpu != 0) {
    fr = __kmpc_threadprivate(0, 0, garbage, (size_t)size);
    to = __kmpc_threadprivate(0, lcpu, garbage, (size_t)size);
    if (to != fr)
      memcpy(to, fr, size);
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
}
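/* Illustrative sketch only (an assumption, not taken from this file): for a
 * TLS-based threadprivate variable, a `copyin` clause would be lowered to a
 * call like _mp_copyin_move_tls so each non-master thread copies the
 * master's instance before executing the region body. */
#if 0
int counter;
#pragma omp threadprivate(counter)

void copyin_example(void) {
  counter = 42; /* set in the master thread's instance */
  #pragma omp parallel copyin(counter)
  {
    /* lowering, roughly:
     *   _mp_copyin_move_tls(&counter_tp, 0, sizeof(counter));
     * where counter_tp is a hypothetical compiler-generated key */
    /* every thread now observes counter == 42 */
  }
}
#endif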
void _mp_copypriv_al(char **adr, long len, int thread) {
  if (thread == 0) {
    singadr = *adr;
    singlen = len;
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
  if (thread)
    memcpy(*adr, singadr, singlen);
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
  /* The second barrier is needed because we must wait until every thread
   * is done copying: there is only one singadr, and if another
   * _mp_copypriv... call follows, we must not overwrite singadr early. */
}
void _mp_copypriv_move(void *blk_tp, int off, int size, int single_thread) {
  int lcpu;
  char *to;
  char *garbage = 0;

  if (single_thread != -1) { /* single thread */
    singadr = __kmpc_threadprivate_cached(0, single_thread, garbage,
                                          (size_t)size, blk_tp);
    singlen = size;
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
  if (single_thread == -1) { /* other threads */
    lcpu = __kmpc_global_thread_num(0);
    to = __kmpc_threadprivate_cached(0, lcpu, garbage, (size_t)size, blk_tp);
    memcpy(to, singadr, size);
  }
  __kmpc_barrier(0, __kmpc_global_thread_num(0));
}
int main() {
  int dep;

#pragma omp taskgroup
  {
    /*
     * Corresponds to:
     *   #pragma omp target nowait depend(out: dep)
     *   {
     *     my_sleep( 0.1 );
     *   }
     */
    kmp_depend_info_t dep_info;
    dep_info.base_addr = (long)&dep;
    dep_info.len = sizeof(int);
    // out = inout per spec, and the runtime expects this
    dep_info.flags.in = 1;
    dep_info.flags.out = 1;

    kmp_int32 gtid = __kmpc_global_thread_num(NULL);
    kmp_task_t *proxy_task = __kmpc_omp_task_alloc(
        NULL, gtid, 17, sizeof(kmp_task_t), 0, &task_entry);
    __kmpc_omp_task_with_deps(NULL, gtid, proxy_task, 1, &dep_info, 0, NULL);

#pragma omp task depend(in: dep)
    {
      /*
       * Corresponds to:
       *   #pragma omp target nowait
       *   {
       *     my_sleep( 0.1 );
       *   }
       */
      kmp_task_t *nested_proxy_task = __kmpc_omp_task_alloc(
          NULL, gtid, 17, sizeof(kmp_task_t), 0, &task_entry);
      __kmpc_omp_task(NULL, gtid, nested_proxy_task);
    }
  }

  // only check that it didn't crash
  return 0;
}
int main() {
  int i;
  int iter[N];
  struct dim dims;

  for (i = 0; i < N; ++i)
    iter[i] = 1;
  dims.lo = 1;
  dims.up = N - 1;
  dims.st = 1;

#pragma omp parallel num_threads(4)
  {
    int i, gtid;
    long long vec;
    gtid = __kmpc_global_thread_num(NULL);
    __kmpc_doacross_init(NULL, gtid, 1, &dims); // thread starts the loop
#pragma omp for nowait schedule(dynamic)
    for (i = 1; i < N; ++i) {
      // runtime call corresponding to #pragma omp ordered depend(sink: i-1)
      vec = i - 1;
      __kmpc_doacross_wait(NULL, gtid, &vec);
      // user's code
      iter[i] = iter[i - 1] + 1;
      // runtime call corresponding to #pragma omp ordered depend(source)
      vec = i;
      __kmpc_doacross_post(NULL, gtid, &vec);
    }
    // thread finishes the loop (should be before the loop barrier)
    __kmpc_doacross_fini(NULL, gtid);
  }

  if (iter[N - 1] == N) {
    printf("passed\n");
  } else {
    printf("failed %d != %d\n", iter[N - 1], N);
    return 1;
  }
  return 0;
}
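/* For reference, a sketch of the same computation written with OpenMP 4.5
 * doacross directives instead of the explicit __kmpc_doacross_* calls above;
 * a conforming compiler generates the init/wait/post/fini sequence shown in
 * the test for this source form. */
#if 0
#pragma omp parallel for ordered(1) num_threads(4) schedule(dynamic)
for (i = 1; i < N; ++i) {
  #pragma omp ordered depend(sink: i - 1)
  iter[i] = iter[i - 1] + 1;
  #pragma omp ordered depend(source)
}
#endif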
// ---------------------------------------------------------------------------
void run_loop(int loop_lb,  // Loop lower bound.
              int loop_ub,  // Loop upper bound.
              int loop_st,  // Loop stride.
              int lchunk) {
  static int volatile loop_sync = 0;
  int lb;  // Chunk lower bound.
  int ub;  // Chunk upper bound.
  int st;  // Chunk stride.
  int rc;
  int tid = omp_get_thread_num();
  int gtid = __kmpc_global_thread_num(&loc);
  int last;
  int tc = (loop_ub - loop_lb) / loop_st + 1;
  int ch;
  int no_chunk = 0;
  if (lchunk == 0) {
    no_chunk = 1;
    lchunk = 1;
  }
  ch = lchunk * SIMD_LEN;
#if _DEBUG > 1
  printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
         gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
#endif
  // Don't test degenerate cases that should have been discovered by codegen.
  if (loop_st == 0)
    return;
  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
    return;

  __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
                         loop_lb, loop_ub, loop_st, SIMD_LEN);
  {
    // Let the master thread handle the chunks alone.
    int chunk;    // Index of the current chunk.
    int last_ub;  // Upper bound of the last processed chunk.
    u64 cur;      // Number of iterations in the current chunk.
    u64 max;      // Max allowed iterations for the current chunk.
    int undersized = 0;

    last_ub = loop_ub;
    chunk = 0;
    max = (loop_ub - loop_lb) / loop_st + 1;
    // The first chunk can consume all iterations.
    while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
      ++chunk;
#if _DEBUG
      printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
             tid, chunk, (int)lb, (int)ub, (int)(ub - lb + 1));
#endif
      // An undersized chunk must be the final one; if the previous chunk
      // was undersized, receiving another chunk is an error.
      if (undersized)
        printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
      if (loop_st > 0) {
        if (!(ub <= loop_ub))
          printf("Error with ub %d, %d, ch %d, err %d\n",
                 (int)ub, (int)loop_ub, chunk, ++err);
        if (!(lb <= ub))
          printf("Error with bounds %d, %d, %d, err %d\n",
                 (int)lb, (int)ub, chunk, ++err);
      } else {
        if (!(ub >= loop_ub))
          printf("Error with ub %d, %d, %d, err %d\n",
                 (int)ub, (int)loop_ub, chunk, ++err);
        if (!(lb >= ub))
          printf("Error with bounds %d, %d, %d, err %d\n",
                 (int)lb, (int)ub, chunk, ++err);
      } // if
      // The stride should not change.
      if (!(st == loop_st))
        printf("Error with st %d, %d, ch %d, err %d\n",
               (int)st, (int)loop_st, chunk, ++err);
      cur = (ub - lb) / loop_st + 1;
      // Guided scheduling uses FP computations, so the current chunk may
      // be a bit bigger (+1) than the allowed maximum.
      if (!(cur <= max + 1))
        printf("Error with iter %d, %d, err %d\n", (int)cur, (int)max, ++err);
      if (last) {
        if (!no_chunk && cur > ch)
          printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
                 (int)cur, ch, tid, ++err);
      } else {
        if (cur % ch)
          printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
                 chunk, (int)cur, ch, tid, ++err);
      }
      // Update the maximum for the next chunk.
      if (cur < max)
        max = cur;
      last_ub = ub;
      undersized = (cur < ch);
#if _DEBUG > 1
      if (last)
        printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
               undersized, (int)cur, ch, tid, ub, lb, loop_st);
#endif
    } // while
    // Must have the right last iteration index.
    if (loop_st > 0) {
      if (!(last_ub <= loop_ub))
        printf("Error with last1 %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_ub, chunk, ++err);
      if (last && !(last_ub + loop_st > loop_ub))
        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
    } else {
      if (!(last_ub >= loop_ub))
        printf("Error with last1 %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_ub, chunk, ++err);
      if (last && !(last_ub + loop_st < loop_ub))
        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
    } // if
  }
  __kmpc_barrier(&loc, gtid);
} // run_loop
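/* Minimal driver sketch (assumptions: loc, err, and SIMD_LEN are the
 * test-wide globals used above, and the schedule under test is supplied
 * via the OMP_SCHEDULE environment variable, which kmp_sch_runtime_simd
 * picks up). The bounds below are illustrative, not from this file. */
#if 0
int main(void) {
  #pragma omp parallel num_threads(4)
  run_loop(0, 26, 1, 0); /* lb, ub, st, chunk; chunk 0 means none given */
  if (err)
    printf("failed, err = %d\n", err);
  else
    printf("passed\n");
  return err;
}
#endif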
void _mp_ecs_stdio(void) {
  __kmpc_end_critical(0, __kmpc_global_thread_num(0), &sem_stdio);
}
void _mp_ecs(void) {
  __kmpc_end_critical(0, __kmpc_global_thread_num(0), &sem_cs);
}
void _mp_v(kmp_critical_name *sem) {
  __kmpc_end_critical(0, __kmpc_global_thread_num(0), sem);
}
void _mp_cdecl(void *blk, void ***blk_tp, int size) {
  __kmpc_threadprivate_cached(0, __kmpc_global_thread_num(0), (void *)blk,
                              (size_t)size, blk_tp);
}
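/* Usage sketch (an assumption, not from this file): the compiler registers
 * each threadprivate block once via _mp_cdecl, passing the master copy and
 * a cache slot that __kmpc_threadprivate_cached fills in on first use. The
 * names below are illustrative. */
#if 0
static int tp_block;    /* the threadprivate variable's master copy */
static void **tp_cache; /* compiler-generated per-block cache pointer */

void register_tp_block(void) {
  _mp_cdecl(&tp_block, &tp_cache, sizeof(tp_block));
}
#endif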