int main () { Kokkos::initialize (); srand (61391); // Set the random seed int nnumbers = 100000; view_type data ("RND", nnumbers); view_type result ("Prime", nnumbers); count_type count ("Count"); host_view_type h_data = Kokkos::create_mirror_view (data); host_view_type h_result = Kokkos::create_mirror_view (result); host_count_type h_count = Kokkos::create_mirror_view (count); typedef view_type::size_type size_type; // Fill the 'data' array on the host with random numbers. We assume // that they come from some process which is only implemented on the // host, via some library. (That's true in this case.) for (size_type i = 0; i < data.dimension_0 (); ++i) { h_data(i) = rand () % nnumbers; } Kokkos::deep_copy (data, h_data); // copy from host to device Kokkos::parallel_for (data.dimension_0 (), findprimes (data, result, count)); Kokkos::deep_copy (h_count, count); // copy from device to host printf ("Found %i prime numbers in %i random numbers\n", h_count(), nnumbers); Kokkos::finalize (); }
int main() { Kokkos::initialize(); srand(61391); int nnumbers = 100000; view_type data("RND",nnumbers); view_type result("Prime",nnumbers); count_type count("Count"); host_view_type h_data = Kokkos::create_mirror_view(data); host_view_type h_result = Kokkos::create_mirror_view(result); host_count_type h_count = Kokkos::create_mirror_view(count); typedef view_type::size_type size_type; for (size_type i = 0; i < data.dimension_0(); ++i) { h_data(i) = rand () % nnumbers; } Kokkos::deep_copy(data,h_data); Kokkos::parallel_for(data.dimension_0(),findprimes(data,result,count)); Kokkos::deep_copy(h_count,count); printf("Found %i prime numbers in %i random numbers\n",h_count(),nnumbers); Kokkos::finalize(); }
// Reduces all elements of `in` to one value of type To and returns it.
//
// Linear inputs with at most 4096 elements are copied to the host and
// reduced there, with NaN values replaced by `nanval` when `change_nan` is
// set.  Every other input is first passed to reduce_first_launcher (which
// also receives `change_nan` and `nanval`), and the values it leaves in a
// temporary buffer are then combined on the host.
To reduce_all(CParam<Ti> in, bool change_nan, double nanval) {
    const int total = in.dims[0] * in.dims[1] * in.dims[2] * in.dims[3];

    // The input counts as linear when each stride equals the previous stride
    // times the previous dimension, starting from a unit stride.
    bool contiguous = (in.strides[0] == 1);
    for (int d = 1; d < 4; ++d) {
        contiguous =
            contiguous && (in.strides[d] == (in.strides[d - 1] * in.dims[d - 1]));
    }

    // FIXME: Use better heuristics to get to the optimum number
    if (contiguous && total <= 4096) {
        // Small linear input: bring everything to the host and fold it there.
        std::vector<Ti> host_in(total);
        CUDA_CHECK(cudaMemcpyAsync(host_in.data(), in.ptr, total * sizeof(Ti),
                                   cudaMemcpyDeviceToHost,
                                   cuda::getActiveStream()));
        CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

        Transform<Ti, To, op> to_output;
        Binary<To, op> combine;
        To acc            = Binary<To, op>::init();
        To nan_substitute = scalar<To>(nanval);

        for (const Ti &raw : host_in) {
            To value = to_output(raw);
            if (change_nan && IS_NAN(value)) { value = nan_substitute; }
            acc = combine(acc, value);
        }
        return acc;
    }

    if (contiguous) {
        // Treat a linear input as a single long first dimension.
        in.dims[0] = total;
        for (int d = 1; d < 4; ++d) {
            in.dims[d]    = 1;
            in.strides[d] = total;
        }
    }

    uint threads_x = nextpow2(std::max(32u, static_cast<uint>(in.dims[0])));
    threads_x      = std::min(threads_x, THREADS_PER_BLOCK);
    uint threads_y = THREADS_PER_BLOCK / threads_x;

    uint blocks_x = divup(in.dims[0], threads_x * REPEAT);
    uint blocks_y = divup(in.dims[1], threads_y);

    // The temporary output is shaped like `in`, except that the first
    // dimension is shrunk to blocks_x; its strides are densely packed.
    Param<To> partial;
    partial.dims[0]    = blocks_x;
    partial.strides[0] = 1;
    for (int d = 1; d < 4; ++d) {
        partial.dims[d]    = in.dims[d];
        partial.strides[d] = partial.dims[d - 1] * partial.strides[d - 1];
    }

    int partial_count  = partial.strides[3] * partial.dims[3];
    auto partial_alloc = memAlloc<To>(partial_count);
    partial.ptr        = partial_alloc.get();

    reduce_first_launcher<Ti, To, op>(partial, in, blocks_x, blocks_y,
                                      threads_x, change_nan, nanval);

    // Copy the temporary buffer back and finish the reduction on the host.
    std::vector<To> host_partial(partial_count);
    CUDA_CHECK(cudaMemcpyAsync(host_partial.data(), partial.ptr,
                               partial_count * sizeof(To),
                               cudaMemcpyDeviceToHost,
                               cuda::getActiveStream()));
    CUDA_CHECK(cudaStreamSynchronize(cuda::getActiveStream()));

    Binary<To, op> combine;
    To acc = Binary<To, op>::init();
    for (const To &piece : host_partial) { acc = combine(acc, piece); }
    return acc;
}
int main(int argc, char **argv) { /*********/ /* INPUT */ /*********/ if (argc < 3) throw std::runtime_error( "Usage: ./cmd_line_optical_flow <input.h5> <output.h5>"); // create files util::HDF5File in_file(argv[1]); util::HDF5File out_file(argv[2]); std::vector<int> image0_size, image1_size; std::vector<float> image0_data, image1_data; in_file.readArray("image0", image0_data, image0_size); in_file.readArray("image1", image1_data, image1_size); if (image0_size.size() != 2) throw std::runtime_error("Expecting 2D float image"); if (image0_size != image1_size) throw std::runtime_error("Expecting equal size images"); // default arguments vision::D_OpticalAndARFlow::Parameters parameters; parameters.n_scales_ = fetchScalar<int>(in_file, "n_scales", parameters.n_scales_); parameters.median_filter_ = (bool)fetchScalar<int>(in_file, "median_filter", parameters.median_filter_); parameters.consistent_ = (bool)fetchScalar<int>(in_file, "consistent", parameters.consistent_); parameters.cons_thres_ = fetchScalar<float>(in_file, "cons_thres", parameters.cons_thres_); parameters.four_orientations_ = fetchScalar<int>( in_file, "four_orientations", parameters.four_orientations_); // time execution? 
bool timing = (bool)fetchScalar<int>(in_file, "timing", false); /***********/ /* PROCESS */ /***********/ int width = image0_size.at(1); int height = image0_size.at(0); // this run also serves as warm-up for the timing code util::Device2D<float> d_image0(width, height); d_image0.copyFrom(image0_data); util::Device2D<float> d_image1(width, height); d_image1.copyFrom(image1_data); vision::D_OpticalAndARFlow optical_flow(d_image0, parameters); optical_flow.addImageReal(d_image1); optical_flow.updateOpticalFlowReal(); // output already since timing will overwrite auto &flow_x = optical_flow.getOpticalFlowX(); std::vector<int> h_size{ height, width }; std::vector<float> h_data(h_size.at(0) * h_size.at(1)); flow_x.copyTo(h_data); out_file.writeArray("optical_flow_x", h_data, h_size); auto &flow_y = optical_flow.getOpticalFlowY(); flow_y.copyTo(h_data); out_file.writeArray("optical_flow_y", h_data, h_size); // timing float image_copy_time, gabor_time, flow_time; if (timing) { int n_reps = 10; util::TimerGPU timer; // image copy timer.reset(); for (int r = 0; r < n_reps; r++) d_image1.copyFrom(image1_data); image_copy_time = timer.read() / (float)n_reps; // gabor filtering timer.reset(); for (int r = 0; r < n_reps; r++) optical_flow.addImageReal(d_image1); gabor_time = timer.read() / (float)n_reps; // optical flow timer.reset(); for (int r = 0; r < n_reps; r++) optical_flow.updateOpticalFlowReal(); flow_time = timer.read() / (float)n_reps; // output timers out_file.writeScalar("image_copy_time", image_copy_time); out_file.writeScalar("gabor_time", gabor_time); out_file.writeScalar("flow_time", flow_time); } return EXIT_SUCCESS; }