bool concurrency_handler::_handler(void) {
  if (!_handler_initialized) {
      initialize_worker_thread_for_TAU();
      _handler_initialized = true;
  }
  if (_terminate) return true;
  apex* inst = apex::instance();
  if (inst == nullptr) return false; // running after finalization!
#ifdef APEX_HAVE_TAU
  if (apex_options::use_tau()) {
    TAU_START("concurrency_handler::_handler");
  }
#endif
  //cout << "HANDLER: " << endl;
  map<task_identifier, unsigned int> *counts = new(map<task_identifier, unsigned int>);
  stack<task_identifier>* tmp;
//  std::mutex* mut;
  for (unsigned int i = 0 ; i < _event_stack.size() ; i++) {
    if (_option > 1 && !thread_instance::map_id_to_worker(i)) {
      continue;
    }
    if (inst != nullptr && inst->get_state(i) == APEX_THROTTLED) { continue; }
    tmp = get_event_stack(i);
    task_identifier func;
    if (tmp != nullptr && tmp->size() > 0) {
      _per_thread_mutex[i]->lock();
      if (tmp->size() > 0) {
        func = tmp->top();
      } else {
        _per_thread_mutex[i]->unlock();
        continue;
      }
      _per_thread_mutex[i]->unlock();
      _function_mutex.lock();
      _functions.insert(func);
      _function_mutex.unlock();
      if (counts->find(func) == counts->end()) {
        (*counts)[func] = 1;
      } else {
        (*counts)[func] = (*counts)[func] + 1;
      }
    }
  }
  _states.push_back(counts);
  _thread_cap_samples.push_back(get_thread_cap());
  // TODO: FIXME multiple tuning sessions
  //for(auto param : get_tunable_params()) {
  //  _tunable_param_samples[param.first].push_back(*param.second);
  //}
  int power = current_power_high();
  _power_samples.push_back(power);
#ifdef APEX_HAVE_TAU
  if (apex_options::use_tau()) {
    TAU_STOP("concurrency_handler::_handler");
  }
#endif
  return true;
}
Exemple #2
0
/**
 * Enqueue this kernel on the `commands` queue over a 3-D index space.
 *
 * @param gWorkSizeX,gWorkSizeY,gWorkSizeF  global work size in each dimension
 * @param lWorkSizeX,lWorkSizeY,lWorkSizeF  local (work-group) size in each
 *        dimension; if any of them is 0, NULL is passed as the local size so
 *        the OpenCL implementation chooses it
 * @param nWait     number of events in waitList
 * @param waitList  events forwarded to clEnqueueNDRangeKernel as its wait list
 * @param ev        event pointer forwarded to clEnqueueNDRangeKernel
 * @return the OpenCL status of the last call; CL_SUCCESS when PV_USE_OPENCL is
 *         not defined (the function is then a no-op).
 *
 * Any OpenCL failure, an over-sized work group, or `profiling` being set
 * terminates the process via exit().
 */
int CLKernel::run(size_t gWorkSizeX, size_t gWorkSizeY, size_t gWorkSizeF,
                  size_t lWorkSizeX, size_t lWorkSizeY, size_t lWorkSizeF,
                  unsigned int nWait, cl_event * waitList, cl_event * ev)
{
   int status = CL_SUCCESS;

#ifdef PV_USE_OPENCL

   size_t local_work_size[3];
   size_t global_work_size[3];
   size_t max_local_size;

   global_work_size[0] = gWorkSizeX;
   global_work_size[1] = gWorkSizeY;
   global_work_size[2] = gWorkSizeF;

   local_work_size[0] = lWorkSizeX;
   local_work_size[1] = lWorkSizeY;
   local_work_size[2] = lWorkSizeF;

#ifdef PV_USE_TAU
   int tau_id = 10;
   TAU_START("CLKernel::run::CPU");
#endif

   // get the maximum work group size for executing the kernel on the device
   //
   status = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                                     sizeof(size_t), &max_local_size, NULL);
   if (status != CL_SUCCESS) {
      fprintf(stderr, "Error: Failed to retrieve kernel work group info! (status==%d)\n", status);
      CLDevice::print_error_code(status);
      exit(status);
   } else {
//      printf("run: local_work_size==(%ld,%ld) global_work_size==(%ld,%ld)\n",
//             local_work_size[0], local_work_size[1], global_work_size[0], global_work_size[1]);
   }

   // size_t has no portable pre-C99 printf specifier, so the values are cast
   // to unsigned long to match %lu.
   const size_t local_total = lWorkSizeX * lWorkSizeY * lWorkSizeF;
   if (local_total > max_local_size) {
      fprintf(stderr, "Error: Work size of %lu is bigger than max_local_size of %lu\n",
              static_cast<unsigned long>(local_total),
              static_cast<unsigned long>(max_local_size));
      exit(-1);
   }

   // Marker events for the (currently disabled) profiling path. Initialized so
   // that they never hold indeterminate handles.
   cl_event startMark = NULL;
   cl_event endMark = NULL;

   //TODO doesn't work on neuro
   if (profiling) {
      //TODO doesn't work on neuro
      printf("Profiling not implemented\n");
      exit(1);

      //TODO - why not use clEnqueueBarrierWithWaitList
      //int error = clEnqueueMarkerWithWaitList(commands, nWait, waitList, &startMark);
      //error |= clFinish(commands);
      //if (error) CLDevice::print_error_code(error);
   }

   // execute the kernel over the full 3-D global range; when any local
   // dimension is 0 the work-group size is left to the OpenCL implementation
   //
   if(local_work_size[0] == 0 || local_work_size[1] == 0 || local_work_size[2] == 0){
      status = clEnqueueNDRangeKernel(commands, kernel, 3, NULL,
                                      global_work_size, NULL, nWait, waitList, ev);
   }
   else{
      status = clEnqueueNDRangeKernel(commands, kernel, 3, NULL,
                                      global_work_size, local_work_size, nWait, waitList, ev);
   }
   //TODO doesn't work with neuro
   // NOTE: unreachable while the profiling check above exits the process.
   if (profiling) {
      //TODO doesn't work on neuro
      printf("Profiling not implemented\n");
      exit(1);
      //int error = clEnqueueMarkerWithWaitList(commands, nWait, waitList, &endMark);
      //error |= clFinish(commands);
      //if (error) CLDevice::print_error_code(error);
   }

   //clFinish(commands);
   if (status) {
      fprintf(stderr, "CLDevice::run(): Failed to execute kernel! (status==%d)\n", status);
      fprintf(stderr, "CLDevice::run(): max_local_work_size==%lu\n",
              static_cast<unsigned long>(max_local_size));
      CLDevice::print_error_code(status);
      exit(status);
   }

#ifdef PV_USE_TAU
   // Stop the CPU-side timer on every successful call. It used to be stopped
   // only inside the profiling branch below, which cannot be reached, so the
   // timer started above was never stopped.
   TAU_STOP("CLKernel::run::CPU");
#endif

   // get profiling information
   // NOTE: unreachable while the profiling check above exits the process.
   //
   if (profiling) {
      //size_t param_size;
      cl_ulong start=0, end=0;
#ifdef PV_USE_TAU
      tau_id += 1000;
#endif
//      status = clGetEventProfilingInfo(*ev, CL_PROFILING_COMMAND_START,
//                                       sizeof(start), &start, &param_size);
//      status = clGetEventProfilingInfo(*ev, CL_PROFILING_COMMAND_END,
//                                       sizeof(end), &end, &param_size);
      status = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END,
                                       sizeof(start), &start, NULL);
      status |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END,
                                       sizeof(end), &end, NULL);
      if (status == 0) {
         elapsed = (end - start) / 1000;  // microseconds
      }
      //fprintf(stderr, "status %d\n",status);
      //CLDevice::print_error_code(status);
      //fprintf(stderr, "start %lu, end %lu, elapsed %u\n",(unsigned long)start, (unsigned long)end, elapsed);
#ifdef PV_USE_TAU
      Tau_opencl_register_gpu_event("CLKernel::run::GPU", tau_id, start, end);
#endif
   }

#endif // PV_USE_OPENCL

   return status;
}
Exemple #3
0
int main (int argc, char *argv[]) 
{
    validate_input(argc, argv);

    /*
     * Initialize TAU and start a timer for the main function.
     */
    TAU_INIT(&argc, &argv);
    TAU_PROFILE_SET_NODE(0);
    TAU_PROFILE_TIMER(tautimer, __func__, my_name, TAU_USER);
    TAU_PROFILE_START(tautimer);

    /*
     * Initialize MPI. We don't require threaded support, but with threads
     * we can send the TAU data over SOS asynchronously.
     */
    int rc = MPI_SUCCESS;
    int provided = 0;
    rc = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (rc != MPI_SUCCESS) {
        char *errorstring;
        int length = 0;
        MPI_Error_string(rc, errorstring, &length);
        fprintf(stderr, "Error: MPI_Init failed, rc = %d\n%s\n", rc, errorstring);
        fflush(stderr);
        exit(99);
    }

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
    my_printf("%s %s %d Running with comm_size %d\n", argv[0], my_name, getpid(), comm_size);
    MPI_Comm adios_comm;
    MPI_Comm_dup(MPI_COMM_WORLD, &adios_comm);

    adios_init ("arrays.xml", adios_comm);

    /*
     * Loop and do the things
     */
    int iter = 0;
    char tmpstr[256] = {0};
    int * return_codes = (int *)(calloc(num_sources,sizeof(int)));
    while (iter < iterations) {
        int index;
        /*
         * Read upstream input
         */
        for (index = 0 ; index < num_sources ; index++) {
            if (return_codes[index] > 0) {
                my_printf("%s source is gone\n", sources[index]);
                continue; // this input is gone
            }
            my_printf ("%s reading from %s.\n", my_name, sources[index]);
            sprintf(tmpstr,"%s READING FROM %s", my_name, sources[index]);
            TAU_START(tmpstr);
            //mpi_reader(adios_comm, sources[index]);
            return_codes[index] = flexpath_reader(adios_comm, index);
            TAU_STOP(tmpstr);
        }
        /*
        * "compute"
        */
        my_printf ("%s computing.\n", my_name);
        compute(iter);
        bool time_to_go = (num_sources == 0) ? (iter == (iterations-1)) : true;
        for (index = 0 ; index < num_sources ; index++) {
            if (return_codes[index] == 0) {
                time_to_go = false;
                break; // out of this for loop
            }
        }
        /*
         * Send output downstream
         */
        for (index = 0 ; index < num_sinks ; index++) {
            my_printf ("%s writing to %s.\n", my_name, sinks[index]);
            sprintf(tmpstr,"%s WRITING TO %s", my_name, sinks[index]);
            TAU_START(tmpstr);
            //mpi_writer(adios_comm, sinks[index]);
            flexpath_writer(adios_comm, index, (iter > 0), time_to_go);
            TAU_STOP(tmpstr);
        }
        if (time_to_go) {
            break; // out of the while loop
        }
        my_printf ("%s not time to go...\n", my_name);
        iter++;
    }

    /*
     * Finalize ADIOS
     */
    const char const * dot_filename = ".finished";
    if (num_sources > 0) {
        adios_read_finalize_method(ADIOS_READ_METHOD_FLEXPATH);
    #if 0
    } else {
        while (true) {
            // assume this is the main process. It can't exit until 
            // the last process is done.
            if( access( dot_filename, F_OK ) != -1 ) {
                // file exists
                unlink(dot_filename);
                break;
            } else {
                // file doesn't exist
                sleep(1);
            }
        }
    #endif
    }
    if (num_sinks > 0) {
        adios_finalize (my_rank);
    #if 0
    } else {
        // assume this is the last process. 
        // Tell the main process we are done.
        FILE *file;
        if (file = fopen(dot_filename, "w")) {
            fprintf(file, "done.\n");
            fclose(file);
        }
    #endif
    }

    /*
     * Finalize MPI
     */
    MPI_Comm_free(&adios_comm);
    MPI_Finalize();
    my_printf ("%s Done.\n", my_name);

    TAU_PROFILE_STOP(tautimer);
    return 0;
}
Exemple #4
0
 /// Stops the TAU timer whose name is "<attribute name>=<value>", built from
 /// attr.name() and value.to_string(). The Caliper pointer is not used.
 virtual void endAction(Caliper* c, const Attribute& attr, const Variant& value){
   // Assemble the timer name piecewise instead of going through a stream.
   std::string timer_name(attr.name());
   timer_name += "=";
   timer_name += value.to_string();
   TAU_STOP(timer_name.c_str());
 }