Example 1
int main () {
  Kokkos::initialize ();

  srand (61391); // Set the random seed

  int nnumbers = 100000;
  view_type data ("RND", nnumbers);
  view_type result ("Prime", nnumbers);
  count_type count ("Count");

  host_view_type h_data = Kokkos::create_mirror_view (data);
  host_view_type h_result = Kokkos::create_mirror_view (result);
  host_count_type h_count = Kokkos::create_mirror_view (count);

  typedef view_type::size_type size_type;
  // Fill the 'data' array on the host with random numbers.  We assume
  // that they come from some process which is only implemented on the
  // host, via some library.  (That's true in this case.)
  for (size_type i = 0; i < data.dimension_0 (); ++i) {
    h_data(i) = rand () % nnumbers;
  }
  Kokkos::deep_copy (data, h_data); // copy from host to device

  Kokkos::parallel_for (data.dimension_0 (), findprimes (data, result, count));
  Kokkos::deep_copy (h_count, count); // copy from device to host

  printf ("Found %i prime numbers in %i random numbers\n", h_count(), nnumbers);
  Kokkos::finalize ();
}
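The snippet relies on a handful of definitions that are not shown: the view typedefs and the findprimes functor. A minimal sketch of what they might look like, assuming rank-1 Kokkos::View<int*> arrays for the data and the primes found, a rank-0 Kokkos::View<int> counter, and a hypothetical trial-division test:

#include <Kokkos_Core.hpp>
#include <cstdio>
#include <cstdlib>

typedef Kokkos::View<int*> view_type;
typedef view_type::HostMirror host_view_type;
typedef Kokkos::View<int> count_type;
typedef count_type::HostMirror host_count_type;

// Functor: test each entry of 'data' for primality and, if prime, append it
// to 'result' at an atomically reserved index while counting the primes.
struct findprimes {
  view_type data;
  view_type result;
  count_type count;

  findprimes (view_type data_, view_type result_, count_type count_)
    : data (data_), result (result_), count (count_) {}

  KOKKOS_INLINE_FUNCTION
  void operator() (const int i) const {
    const int n = data(i);
    bool is_prime = (n >= 2);
    for (int d = 2; d * d <= n && is_prime; ++d) {
      if (n % d == 0) is_prime = false;
    }
    if (is_prime) {
      // Reserve a slot in 'result' and bump the global count atomically.
      const int idx = Kokkos::atomic_fetch_add (&count(), 1);
      result(idx) = n;
    }
  }
};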
Example 2
template<typename Ti, typename To, af_op_t op>
To reduce_all(CParam<Ti> in, bool change_nan, double nanval) {
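    // Total number of input elements, plus a check that the input is densely
    // packed (each stride equals the product of the preceding dims), so it
    // can be treated as a flat 1D array.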
    int in_elements = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3];
    bool is_linear  = (in.strides[0] == 1);
    for (int k = 1; k < 4; k++) {
        is_linear &= (in.strides[k] == (in.strides[k - 1] * in.dims[k - 1]));
    }

    // FIXME: Use better heuristics to get to the optimum number
    if (in_elements > 4096 || !is_linear) {
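        // Large or non-contiguous input: do a first reduction stage on the
        // device.  A contiguous input is first collapsed into a single
        // dimension so the kernel sees it as one long row.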
        if (is_linear) {
            in.dims[0] = in_elements;
            for (int k = 1; k < 4; k++) {
                in.dims[k]    = 1;
                in.strides[k] = in_elements;
            }
        }

        uint threads_x = nextpow2(std::max(32u, (uint)in.dims[0]));
        threads_x      = std::min(threads_x, THREADS_PER_BLOCK);
        uint threads_y = THREADS_PER_BLOCK / threads_x;

        Param<To> tmp;
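        // tmp will hold the per-block partial results of the first-stage
        // reduction along dimension 0 (blocks_x partials per output row).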

        uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
        uint blocks_y = divup(in.dims[1], threads_y);

        tmp.dims[0]    = blocks_x;
        tmp.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            tmp.dims[k]    = in.dims[k];
            tmp.strides[k] = tmp.dims[k - 1] * tmp.strides[k - 1];
        }

        int tmp_elements = tmp.strides[3] * tmp.dims[3];

        auto tmp_alloc = memAlloc<To>(tmp_elements);
        tmp.ptr        = tmp_alloc.get();
        reduce_first_launcher<Ti, To, op>(tmp, in, blocks_x, blocks_y,
                                          threads_x, change_nan, nanval);

        std::vector<To> h_data(tmp_elements);
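        // Copy the per-block partial results back to the host and finish
        // the reduction sequentially on the CPU.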
        CUDA_CHECK(
            cudaMemcpyAsync(h_data.data(), tmp.ptr, tmp_elements * sizeof(To),
                            cudaMemcpyDeviceToHost, cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

        Binary<To, op> reduce;
        To out = Binary<To, op>::init();
        for (int i = 0; i < tmp_elements; i++) { out = reduce(out, h_data[i]); }

        return out;
    } else {
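        // Small, contiguous input: copy it straight to the host and reduce
        // entirely on the CPU, applying the optional NaN replacement.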
        std::vector<Ti> h_data(in_elements);
        CUDA_CHECK(
            cudaMemcpyAsync(h_data.data(), in.ptr, in_elements * sizeof(Ti),
                            cudaMemcpyDeviceToHost, cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

        Transform<Ti, To, op> transform;
        Binary<To, op> reduce;
        To out       = Binary<To, op>::init();
        To nanval_to = scalar<To>(nanval);

        for (int i = 0; i < in_elements; i++) {
            To in_val = transform(h_data[i]);
            if (change_nan) in_val = !IS_NAN(in_val) ? in_val : nanval_to;
            out = reduce(out, in_val);
        }

        return out;
    }
}
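The launch-geometry helpers divup and nextpow2 are assumed by the snippet but not shown. Illustrative stand-ins with the usual semantics (ArrayFire ships its own definitions; these are only a sketch):

// Ceiling division: how many blocks of size b are needed to cover a elements.
static unsigned divup(unsigned a, unsigned b) { return (a + b - 1) / b; }

// Round v up to the next power of two (assumes v > 0); returns v itself
// if it is already a power of two.
static unsigned nextpow2(unsigned v) {
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    return v + 1;
}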
Example 3
int main(int argc, char **argv) {

  /*********/
  /* INPUT */
  /*********/

  if (argc < 3)
    throw std::runtime_error(
        "Usage: ./cmd_line_optical_flow <input.h5> <output.h5>");

  // create files
  util::HDF5File in_file(argv[1]);
  util::HDF5File out_file(argv[2]);

  std::vector<int> image0_size, image1_size;
  std::vector<float> image0_data, image1_data;

  in_file.readArray("image0", image0_data, image0_size);
  in_file.readArray("image1", image1_data, image1_size);

  if (image0_size.size() != 2)
    throw std::runtime_error("Expecting 2D float image");

  if (image0_size != image1_size)
    throw std::runtime_error("Expecting equal size images");

  // default arguments
  vision::D_OpticalAndARFlow::Parameters parameters;

  parameters.n_scales_ =
      fetchScalar<int>(in_file, "n_scales", parameters.n_scales_);
  parameters.median_filter_ = (bool)fetchScalar<int>(in_file, "median_filter",
                                                     parameters.median_filter_);
  parameters.consistent_ =
      (bool)fetchScalar<int>(in_file, "consistent", parameters.consistent_);
  parameters.cons_thres_ =
      fetchScalar<float>(in_file, "cons_thres", parameters.cons_thres_);
  parameters.four_orientations_ = (bool)fetchScalar<int>(
      in_file, "four_orientations", parameters.four_orientations_);

  // time execution?
  bool timing = (bool)fetchScalar<int>(in_file, "timing", false);

  /***********/
  /* PROCESS */
  /***********/

  int width = image0_size.at(1);
  int height = image0_size.at(0);

  // this run also serves as warm-up for the timing code
  util::Device2D<float> d_image0(width, height);
  d_image0.copyFrom(image0_data);
  util::Device2D<float> d_image1(width, height);
  d_image1.copyFrom(image1_data);
  vision::D_OpticalAndARFlow optical_flow(d_image0, parameters);
  optical_flow.addImageReal(d_image1);
  optical_flow.updateOpticalFlowReal();

  // output already since timing will overwrite
  auto &flow_x = optical_flow.getOpticalFlowX();
  std::vector<int> h_size{ height, width };
  std::vector<float> h_data(h_size.at(0) * h_size.at(1));
  flow_x.copyTo(h_data);
  out_file.writeArray("optical_flow_x", h_data, h_size);
  auto &flow_y = optical_flow.getOpticalFlowY();
  flow_y.copyTo(h_data);
  out_file.writeArray("optical_flow_y", h_data, h_size);

  // timing
  float image_copy_time, gabor_time, flow_time;
  if (timing) {
    int n_reps = 10;
    util::TimerGPU timer;

    // image copy
    timer.reset();
    for (int r = 0; r < n_reps; r++)
      d_image1.copyFrom(image1_data);
    image_copy_time = timer.read() / (float)n_reps;

    // gabor filtering
    timer.reset();
    for (int r = 0; r < n_reps; r++)
      optical_flow.addImageReal(d_image1);
    gabor_time = timer.read() / (float)n_reps;

    // optical flow
    timer.reset();
    for (int r = 0; r < n_reps; r++)
      optical_flow.updateOpticalFlowReal();
    flow_time = timer.read() / (float)n_reps;

    // output timers
    out_file.writeScalar("image_copy_time", image_copy_time);
    out_file.writeScalar("gabor_time", gabor_time);
    out_file.writeScalar("flow_time", flow_time);
  }

  return EXIT_SUCCESS;
}
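fetchScalar is assumed by the snippet but not defined in it: it reads an optional scalar from the input HDF5 file and falls back to a default when the dataset is missing. A hypothetical sketch, where the util::HDF5File method names (checkVariableExists, readScalar) are placeholders rather than the actual API:

template <typename T>
T fetchScalar(util::HDF5File &in_file, const std::string &name,
              T default_value) {
  // Placeholder method names; substitute whatever util::HDF5File provides.
  return in_file.checkVariableExists(name) ? in_file.readScalar<T>(name)
                                           : default_value;
}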