// Periodic sampler: for every known thread, record the task currently on top
// of its event stack, then snapshot the thread cap and current power draw.
// Each invocation appends one sample to _states / _thread_cap_samples /
// _power_samples. Returns true on a completed (or terminated) sample, false
// if APEX has already been finalized.
bool concurrency_handler::_handler(void) {
    // Lazily register this handler thread with TAU on its first invocation.
    if (!_handler_initialized) {
        initialize_worker_thread_for_TAU();
        _handler_initialized = true;
    }
    if (_terminate) return true;
    apex* inst = apex::instance();
    if (inst == nullptr) return false; // running after finalization!
#ifdef APEX_HAVE_TAU
    if (apex_options::use_tau()) {
        TAU_START("concurrency_handler::_handler");
    }
#endif
    //cout << "HANDLER: " << endl;
    // One sample: task identifier -> number of threads currently running it.
    // Ownership of this heap map is transferred to _states below.
    map<task_identifier, unsigned int> *counts = new(map<task_identifier, unsigned int>);
    stack<task_identifier>* tmp;
    // std::mutex* mut;
    for (unsigned int i = 0 ; i < _event_stack.size() ; i++) {
        // _option > 1 restricts sampling to worker threads only.
        if (_option > 1 && !thread_instance::map_id_to_worker(i)) {
            continue;
        }
        // Skip threads that the throttling policy has parked.
        if (inst != nullptr && inst->get_state(i) == APEX_THROTTLED) {
            continue;
        }
        tmp = get_event_stack(i);
        task_identifier func;
        if (tmp != nullptr && tmp->size() > 0) {
            _per_thread_mutex[i]->lock();
            // Re-check under the lock: the cheap size() probe above was
            // unsynchronized, so the stack may have emptied in the meantime.
            if (tmp->size() > 0) {
                func = tmp->top();
            } else {
                _per_thread_mutex[i]->unlock();
                continue;
            }
            _per_thread_mutex[i]->unlock();
            // Record that this task has been observed at least once.
            _function_mutex.lock();
            _functions.insert(func);
            _function_mutex.unlock();
            // Tally how many threads are currently executing this task.
            if (counts->find(func) == counts->end()) {
                (*counts)[func] = 1;
            } else {
                (*counts)[func] = (*counts)[func] + 1;
            }
        }
    }
    _states.push_back(counts);
    _thread_cap_samples.push_back(get_thread_cap());
    // TODO: FIXME multiple tuning sessions
    //for(auto param : get_tunable_params()) {
    //    _tunable_param_samples[param.first].push_back(*param.second);
    //}
    int power = current_power_high();
    _power_samples.push_back(power);
#ifdef APEX_HAVE_TAU
    if (apex_options::use_tau()) {
        TAU_STOP("concurrency_handler::_handler");
    }
#endif
    return true;
}
// Enqueue this kernel on the command queue with a 3-D global work size of
// (gWorkSizeX, gWorkSizeY, gWorkSizeF) and local work size of
// (lWorkSizeX, lWorkSizeY, lWorkSizeF); a local size containing any zero
// lets the OpenCL runtime choose the work-group shape. nWait/waitList are
// the events to wait on, and *ev receives the completion event.
// Returns the clEnqueueNDRangeKernel status (CL_SUCCESS on success); fatal
// errors call exit(). No-op (returns CL_SUCCESS) when built without OpenCL.
int CLKernel::run(size_t gWorkSizeX, size_t gWorkSizeY, size_t gWorkSizeF,
                  size_t lWorkSizeX, size_t lWorkSizeY, size_t lWorkSizeF,
                  unsigned int nWait, cl_event * waitList, cl_event * ev)
{
    int status = CL_SUCCESS;
#ifdef PV_USE_OPENCL
    size_t local_work_size[3];
    size_t global_work_size[3];
    size_t max_local_size;

    global_work_size[0] = gWorkSizeX;
    global_work_size[1] = gWorkSizeY;
    global_work_size[2] = gWorkSizeF;
    local_work_size[0] = lWorkSizeX;
    local_work_size[1] = lWorkSizeY;
    local_work_size[2] = lWorkSizeF;

#ifdef PV_USE_TAU
    int tau_id = 10;
    TAU_START("CLKernel::run::CPU");
#endif

    // get the maximum work group size for executing the kernel on the device
    //
    status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t), &max_local_size, NULL);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "Error: Failed to retrieve kernel work group info! (status==%d)\n", status);
        CLDevice::print_error_code(status);
        exit(status);
    } else {
        // printf("run: local_work_size==(%ld,%ld) global_work_size==(%ld,%ld)\n",
        //        local_work_size[0], local_work_size[1], global_work_size[0], global_work_size[1]);
    }

    // Reject a requested work-group that exceeds the device limit before
    // enqueueing (clEnqueueNDRangeKernel would fail with a cryptic code).
    if (lWorkSizeX * lWorkSizeY * lWorkSizeF > max_local_size) {
        fprintf(stderr, "Error: Work size of %lu is bigger than max_local_size of %d\n",
                lWorkSizeX * lWorkSizeY * lWorkSizeF, (int)max_local_size);
        exit(-1);
    }

    // execute the kernel over the entire range of our 1d input data set
    // using the maximum number of work group items for this device
    //
    cl_event startMark, endMark; //TODO doesn't work on neuro
    if (profiling) {
        //TODO doesn't work on neuro
        printf("Profiling not implemented\n");
        exit(1);
        //TODO - why not use clEnqueueBarrierWithWaitList
        //int error = clEnqueueMarkerWithWaitList(commands, nWait, waitList, &startMark);
        //error |= clFinish(commands);
        //if (error) CLDevice::print_error_code(error);
    }

    // A zero in any local dimension means "let the runtime pick the
    // work-group size" (pass NULL for local_work_size).
    if(local_work_size[0] == 0 || local_work_size[1] == 0 || local_work_size[2] == 0){
        status = clEnqueueNDRangeKernel(commands, kernel, 3, NULL,
                                        global_work_size, NULL, nWait, waitList, ev);
    }
    else{
        status = clEnqueueNDRangeKernel(commands, kernel, 3, NULL,
                                        global_work_size, local_work_size, nWait, waitList, ev);
    }

    //TODO doesn't work with neuro
    if (profiling) {
        //TODO doesn't work on neuro
        printf("Profiling not implemented\n");
        exit(1);
        //int error = clEnqueueMarkerWithWaitList(commands, nWait, waitList, &endMark);
        //error |= clFinish(commands);
        //if (error) CLDevice::print_error_code(error);
    }
    //clFinish(commands);

    if (status) {
        fprintf(stderr, "CLDevice::run(): Failed to execute kernel! (status==%d)\n", status);
        fprintf(stderr, "CLDevice::run(): max_local_work_size==%ld\n", max_local_size);
        CLDevice::print_error_code(status);
        exit(status);
    }

    // get profiling information
    //
    // NOTE(review): this branch is currently unreachable — both profiling
    // checks above exit(1) first, and the marker enqueues that would set
    // startMark/endMark are commented out. Confirm before re-enabling.
    if (profiling) {
        //size_t param_size;
        cl_ulong start=0, end=0;
#ifdef PV_USE_TAU
        tau_id += 1000;
        TAU_STOP("CLKernel::run::CPU");
#endif
        // status = clGetEventProfilingInfo(*ev, CL_PROFILING_COMMAND_START,
        //                                  sizeof(start), &start, &param_size);
        // status = clGetEventProfilingInfo(*ev, CL_PROFILING_COMMAND_END,
        //                                  sizeof(end), &end, &param_size);
        status = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END,
                                         sizeof(start), &start, NULL);
        status |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END,
                                          sizeof(end), &end, NULL);
        if (status == 0) {
            elapsed = (end - start) / 1000; // microseconds
        }
        //fprintf(stderr, "status %d\n",status);
        //CLDevice::print_error_code(status);
        //fprintf(stderr, "start %lu, end %lu, elapsed %u\n",(unsigned long)start, (unsigned long)end, elapsed);
#ifdef PV_USE_TAU
        Tau_opencl_register_gpu_event("CLKernel::run::GPU", tau_id, start, end);
#endif
    }
#endif // PV_USE_OPENCL

    return status;
}
int main (int argc, char *argv[]) { validate_input(argc, argv); /* * Initialize TAU and start a timer for the main function. */ TAU_INIT(&argc, &argv); TAU_PROFILE_SET_NODE(0); TAU_PROFILE_TIMER(tautimer, __func__, my_name, TAU_USER); TAU_PROFILE_START(tautimer); /* * Initialize MPI. We don't require threaded support, but with threads * we can send the TAU data over SOS asynchronously. */ int rc = MPI_SUCCESS; int provided = 0; rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); if (rc != MPI_SUCCESS) { char *errorstring; int length = 0; MPI_Error_string(rc, errorstring, &length); fprintf(stderr, "Error: MPI_Init failed, rc = %d\n%s\n", rc, errorstring); fflush(stderr); exit(99); } MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &comm_size); my_printf("%s %s %d Running with comm_size %d\n", argv[0], my_name, getpid(), comm_size); MPI_Comm adios_comm; MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm); adios_init ("arrays.xml", adios_comm); /* * Loop and do the things */ int iter = 0; char tmpstr[256] = {0}; int * return_codes = (int *)(calloc(num_sources,sizeof(int))); while (iter < iterations) { int index; /* * Read upstream input */ for (index = 0 ; index < num_sources ; index++) { if (return_codes[index] > 0) { my_printf("%s source is gone\n", sources[index]); continue; // this input is gone } my_printf ("%s reading from %s.\n", my_name, sources[index]); sprintf(tmpstr,"%s READING FROM %s", my_name, sources[index]); TAU_START(tmpstr); //mpi_reader(adios_comm, sources[index]); return_codes[index] = flexpath_reader(adios_comm, index); TAU_STOP(tmpstr); } /* * "compute" */ my_printf ("%s computing.\n", my_name); compute(iter); bool time_to_go = (num_sources == 0) ? 
(iter == (iterations-1)) : true; for (index = 0 ; index < num_sources ; index++) { if (return_codes[index] == 0) { time_to_go = false; break; // out of this for loop } } /* * Send output downstream */ for (index = 0 ; index < num_sinks ; index++) { my_printf ("%s writing to %s.\n", my_name, sinks[index]); sprintf(tmpstr,"%s WRITING TO %s", my_name, sinks[index]); TAU_START(tmpstr); //mpi_writer(adios_comm, sinks[index]); flexpath_writer(adios_comm, index, (iter > 0), time_to_go); TAU_STOP(tmpstr); } if (time_to_go) { break; // out of the while loop } my_printf ("%s not time to go...\n", my_name); iter++; } /* * Finalize ADIOS */ const char const * dot_filename = ".finished"; if (num_sources > 0) { adios_read_finalize_method(ADIOS_READ_METHOD_FLEXPATH); #if 0 } else { while (true) { // assume this is the main process. It can't exit until // the last process is done. if( access( dot_filename, F_OK ) != -1 ) { // file exists unlink(dot_filename); break; } else { // file doesn't exist sleep(1); } } #endif } if (num_sinks > 0) { adios_finalize (my_rank); #if 0 } else { // assume this is the last process. // Tell the main process we are done. FILE *file; if (file = fopen(dot_filename, "w")) { fprintf(file, "done.\n"); fclose(file); } #endif } /* * Finalize MPI */ MPI_Comm_free(&adios_comm); MPI_Finalize(); my_printf ("%s Done.\n", my_name); TAU_PROFILE_STOP(tautimer); return 0; }
// Caliper callback fired when an annotation region ends: stop the TAU timer
// that was started for this attribute, using the "attr=value" name under
// which it was opened.
virtual void endAction(Caliper* c, const Attribute& attr, const Variant& value){
    std::stringstream label_builder;
    label_builder << attr.name() << "=" << value.to_string();
    const std::string timer_label = label_builder.str();
    TAU_STOP(timer_label.c_str());
}