void TestVectorCppEquality(void) { #if 1 KNOWN_FAILURE; #else thrust::host_vector<int> h_a(3); thrust::host_vector<int> h_b(3); thrust::host_vector<int> h_c(3); h_a[0] = 0; h_a[1] = 1; h_a[2] = 2; h_b[0] = 0; h_b[1] = 1; h_b[2] = 3; h_b[0] = 0; h_b[1] = 1; thrust::device_vector<int> d_a(3); thrust::device_vector<int> d_b(3); thrust::device_vector<int> d_c(3); d_a[0] = 0; d_a[1] = 1; d_a[2] = 2; d_b[0] = 0; d_b[1] = 1; d_b[2] = 3; d_b[0] = 0; d_b[1] = 1; ASSERT_EQUAL((h_a == h_a), true); ASSERT_EQUAL((h_a == d_a), true); ASSERT_EQUAL((d_a == h_a), true); ASSERT_EQUAL((d_a == d_a), true); ASSERT_EQUAL((h_b == h_b), true); ASSERT_EQUAL((h_b == d_b), true); ASSERT_EQUAL((d_b == h_b), true); ASSERT_EQUAL((d_b == d_b), true); ASSERT_EQUAL((h_c == h_c), true); ASSERT_EQUAL((h_c == d_c), true); ASSERT_EQUAL((d_c == h_c), true); ASSERT_EQUAL((d_c == d_c), true); ASSERT_EQUAL((h_a == h_b), false); ASSERT_EQUAL((h_a == d_b), false); ASSERT_EQUAL((d_a == h_b), false); ASSERT_EQUAL((d_a == d_b), false); ASSERT_EQUAL((h_b == h_a), false); ASSERT_EQUAL((h_b == d_a), false); ASSERT_EQUAL((d_b == h_a), false); ASSERT_EQUAL((d_b == d_a), false); ASSERT_EQUAL((h_a == h_c), false); ASSERT_EQUAL((h_a == d_c), false); ASSERT_EQUAL((d_a == h_c), false); ASSERT_EQUAL((d_a == d_c), false); ASSERT_EQUAL((h_c == h_a), false); ASSERT_EQUAL((h_c == d_a), false); ASSERT_EQUAL((d_c == h_a), false); ASSERT_EQUAL((d_c == d_a), false); ASSERT_EQUAL((h_b == h_c), false); ASSERT_EQUAL((h_b == d_c), false); ASSERT_EQUAL((d_b == h_c), false); ASSERT_EQUAL((d_b == d_c), false); ASSERT_EQUAL((h_c == h_b), false); ASSERT_EQUAL((h_c == d_b), false); ASSERT_EQUAL((d_c == h_b), false); ASSERT_EQUAL((d_c == d_b), false); #endif }
/** * seed generator with 32-bit integer */ void seed(unsigned int value) { // compute leapfrog multipliers for initialization cuda::vector<uint48> g_A(dim.threads()), g_C(dim.threads()); cuda::configure(dim.grid, dim.block); get_rand48_kernel().leapfrog(g_A); // compute leapfrog addends for initialization cuda::copy(g_A, g_C); algorithm::gpu::scan<uint48> scan(g_C.size(), dim.threads_per_block()); scan(g_C); // initialize generator with seed cuda::vector<uint48> g_a(1), g_c(1); cuda::host::vector<uint48> h_a(1), h_c(1); cuda::configure(dim.grid, dim.block); get_rand48_kernel().seed(g_A, g_C, g_a, g_c, g_state_, value); cuda::copy(g_a, h_a); cuda::copy(g_c, h_c); // set leapfrog constants for constant device memory rng_.a = h_a.front(); rng_.c = h_c.front(); rng_.g_state = g_state_.data(); }
int main(void) { std::vector<float> h_a(LENGTH); // a vector std::vector<float> h_b(LENGTH); // b vector std::vector<float> h_c (LENGTH, 0xdeadbeef); // c = a + b, from compute device cl::Buffer d_a; // device memory used for the input a vector cl::Buffer d_b; // device memory used for the input b vector cl::Buffer d_c; // device memory used for the output c vector // Fill vectors a and b with random float values int count = LENGTH; for(int i = 0; i < count; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; } try { // Create a context cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context cl::Program program(context, util::loadProgram("vadd.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto vadd = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int>(program, "vadd"); d_a = cl::Buffer(context, begin(h_a), end(h_a), true); d_b = cl::Buffer(context, begin(h_b), end(h_b), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); util::Timer timer; vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_a, d_b, d_c, count); queue.finish(); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; printf("\nThe kernels ran in %lf seconds\n", rtime); cl::copy(queue, d_c, begin(h_c), end(h_c)); // Test the results int correct = 0; float tmp; for(int i = 0; i < count; i++) { tmp = h_a[i] + h_b[i]; // expected value for d_c[i] tmp -= h_c[i]; // compute errors if(tmp*tmp < TOL*TOL) { // correct if square deviation is less correct++; // than tolerance squared } else { printf( " tmp %f h_a %f h_b %f h_c %f \n", tmp, h_a[i], h_b[i], h_c[i]); } } // summarize results printf( "vector add to find C = A+B: %d out of %d results were correct.\n", correct, count); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } }
int main(void) { std::vector<float> h_a(LENGTH); // a vector std::vector<float> h_b(LENGTH); // b vector std::vector<float> h_c (LENGTH, 0xdeadbeef); // c vector (result) std::vector<float> h_d (LENGTH, 0xdeadbeef); // d vector (result) std::vector<float> h_e (LENGTH); // e vector std::vector<float> h_f (LENGTH, 0xdeadbeef); // f vector (result) std::vector<float> h_g (LENGTH); // g vector cl::Buffer d_a; // device memory used for the input a vector cl::Buffer d_b; // device memory used for the input b vector cl::Buffer d_c; // device memory used for the output c vector cl::Buffer d_d; // device memory used for the output d vector cl::Buffer d_e; // device memory used for the input e vector cl::Buffer d_f; // device memory used for the output f vector cl::Buffer d_g; // device memory used for the input g vector // Fill vectors a and b with random float values int count = LENGTH; for(int i = 0; i < count; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_e[i] = rand() / (float)RAND_MAX; h_g[i] = rand() / (float)RAND_MAX; } try { // Create a context cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context cl::Program program(context, util::loadProgram("vadd.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto vadd = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer>(program, "vadd"); d_a = cl::Buffer(context, begin(h_a), end(h_a), true); d_b = cl::Buffer(context, begin(h_b), end(h_b), true); d_e = cl::Buffer(context, begin(h_e), end(h_e), true); d_g = cl::Buffer(context, begin(h_g), end(h_g), true); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(float) * LENGTH); d_d = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(float) * LENGTH); d_f = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_a, d_b, d_c); vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_e, d_c, d_d); vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_g, d_d, d_f); cl::copy(queue, d_f, begin(h_f), end(h_f)); // Test the results int correct = 0; float tmp; for(int i = 0; i < count; i++) { tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i]; // assign element i of a+b+e+g to tmp tmp -= h_f[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else { printf(" tmp %f h_a %f h_b %f h_e %f h_g %f h_f %f\n",tmp, h_a[i], h_b[i], h_e[i], h_g[i], h_f[i]); } } // summarize results printf("C = A+B+E+G: %d out of %d results were correct.\n", correct, count); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << std::endl; } }