// Fill a host array mirror of a MP::Vector view coefficient-by-coefficient,
// deep-copy it to the device view, and verify the contents round-trip.
TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL( Kokkos_View_MP, DeepCopy_HostArray, Storage, Layout )
{
  typedef typename Storage::execution_space Device;
  typedef typename Storage::value_type Scalar;
  typedef Sacado::MP::Vector<Storage> Vector;
  typedef typename ApplyView<Vector*,Layout,Device>::type ViewType;
  typedef typename ViewType::size_type size_type;
  typedef typename ViewType::HostMirror host_view_type;
  typedef typename host_view_type::array_type host_array_type;

  const size_type num_rows = global_num_rows;
  const size_type num_cols =
    Storage::is_static ? Storage::static_size : global_num_cols;
  ViewType v("view", num_rows, num_cols);
  host_array_type h_a = Kokkos::create_mirror_view(v);

  // LayoutRight keeps the sacado dimension last; otherwise the
  // indices are flipped when addressing the flattened array.
  const bool layout_right =
    Kokkos::Impl::is_same< typename ViewType::array_layout,
                           Kokkos::LayoutRight >::value;
  for (size_type row=0; row<num_rows; ++row) {
    for (size_type col=0; col<num_cols; ++col) {
      const Scalar val = generate_vector_coefficient<Scalar>(
        num_rows, num_cols, row, col);
      if (layout_right)
        h_a(row,col) = val;
      else
        h_a(col,row) = val;
    }
  }
  Kokkos::deep_copy(v, h_a);

  success = checkVectorView(v, out);
}
bool checkVectorView(const ViewType& v, Teuchos::FancyOStream& out) { typedef ViewType view_type; typedef typename view_type::size_type size_type; typedef typename view_type::HostMirror host_view_type; typedef typename host_view_type::array_type host_array_type; typedef typename host_array_type::value_type scalar_type; // Copy to host host_view_type h_v = Kokkos::create_mirror_view(v); Kokkos::deep_copy(h_v, v); host_array_type h_a = h_v; size_type num_rows, num_cols; // For static, layout left, sacado dimension becomes first dimension // instead of last bool is_right = Kokkos::Impl::is_same< typename ViewType::array_layout, Kokkos::LayoutRight >::value; if (is_right) { num_rows = h_a.dimension_0(); num_cols = h_a.dimension_1(); } else { num_rows = h_a.dimension_1(); num_cols = h_a.dimension_0(); } bool success = true; if (is_right) { for (size_type i=0; i<num_rows; ++i) { for (size_type j=0; j<num_cols; ++j) { scalar_type val = h_a(i,j); scalar_type val_expected = generate_vector_coefficient<scalar_type>( num_rows, num_cols, i, j); TEUCHOS_TEST_EQUALITY(val, val_expected, out, success); } } } else { for (size_type i=0; i<num_rows; ++i) { for (size_type j=0; j<num_cols; ++j) { scalar_type val = h_a(j,i); scalar_type val_expected = generate_vector_coefficient<scalar_type>( num_rows, num_cols, i, j); TEUCHOS_TEST_EQUALITY(val, val_expected, out, success); } } } return success; }
void TestVectorCppEquality(void) { #if 1 KNOWN_FAILURE; #else thrust::host_vector<int> h_a(3); thrust::host_vector<int> h_b(3); thrust::host_vector<int> h_c(3); h_a[0] = 0; h_a[1] = 1; h_a[2] = 2; h_b[0] = 0; h_b[1] = 1; h_b[2] = 3; h_b[0] = 0; h_b[1] = 1; thrust::device_vector<int> d_a(3); thrust::device_vector<int> d_b(3); thrust::device_vector<int> d_c(3); d_a[0] = 0; d_a[1] = 1; d_a[2] = 2; d_b[0] = 0; d_b[1] = 1; d_b[2] = 3; d_b[0] = 0; d_b[1] = 1; ASSERT_EQUAL((h_a == h_a), true); ASSERT_EQUAL((h_a == d_a), true); ASSERT_EQUAL((d_a == h_a), true); ASSERT_EQUAL((d_a == d_a), true); ASSERT_EQUAL((h_b == h_b), true); ASSERT_EQUAL((h_b == d_b), true); ASSERT_EQUAL((d_b == h_b), true); ASSERT_EQUAL((d_b == d_b), true); ASSERT_EQUAL((h_c == h_c), true); ASSERT_EQUAL((h_c == d_c), true); ASSERT_EQUAL((d_c == h_c), true); ASSERT_EQUAL((d_c == d_c), true); ASSERT_EQUAL((h_a == h_b), false); ASSERT_EQUAL((h_a == d_b), false); ASSERT_EQUAL((d_a == h_b), false); ASSERT_EQUAL((d_a == d_b), false); ASSERT_EQUAL((h_b == h_a), false); ASSERT_EQUAL((h_b == d_a), false); ASSERT_EQUAL((d_b == h_a), false); ASSERT_EQUAL((d_b == d_a), false); ASSERT_EQUAL((h_a == h_c), false); ASSERT_EQUAL((h_a == d_c), false); ASSERT_EQUAL((d_a == h_c), false); ASSERT_EQUAL((d_a == d_c), false); ASSERT_EQUAL((h_c == h_a), false); ASSERT_EQUAL((h_c == d_a), false); ASSERT_EQUAL((d_c == h_a), false); ASSERT_EQUAL((d_c == d_a), false); ASSERT_EQUAL((h_b == h_c), false); ASSERT_EQUAL((h_b == d_c), false); ASSERT_EQUAL((d_b == h_c), false); ASSERT_EQUAL((d_b == d_c), false); ASSERT_EQUAL((h_c == h_b), false); ASSERT_EQUAL((h_c == d_b), false); ASSERT_EQUAL((d_c == h_b), false); ASSERT_EQUAL((d_c == d_b), false); #endif }
// Fill the host mirror of a MP::Vector view, deep-copy into the view's
// flattened device array alias, and verify the contents.
TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL( Kokkos_View_MP, DeepCopy_DeviceArray, Storage, Layout )
{
  typedef typename Storage::device_type Device;
  typedef typename Storage::value_type Scalar;
  typedef Sacado::MP::Vector<Storage> Vector;
  typedef typename ApplyView<Vector*,Layout,Device>::type ViewType;
  typedef typename ViewType::size_type size_type;
  typedef typename ViewType::HostMirror host_view_type;
  typedef typename host_view_type::array_type host_array_type;
  typedef typename ViewType::array_type array_type;

  const size_type num_rows = global_num_rows;
  const size_type num_cols =
    Storage::is_static ? Storage::static_size : global_num_cols;
  ViewType v("view", num_rows, num_cols);
  host_view_type h_v = Kokkos::create_mirror_view(v);
  host_array_type h_a = h_v;
  array_type a = v;

  for (size_type row=0; row<num_rows; ++row)
    for (size_type col=0; col<num_cols; ++col)
      h_a(row,col) = generate_vector_coefficient<Scalar>(
        num_rows, num_cols, row, col);

  // Copy through the device array alias of the view.
  Kokkos::deep_copy(a, h_v);

  success = checkVectorView(v, out);
}
int main() { Kokkos::initialize(); { view_type a ("A", 10); // If view_type and host_mirror_type live in the same memory space, // a "mirror view" is just an alias, and deep_copy does nothing. // Otherwise, a mirror view of a device View lives in host memory, // and deep_copy does a deep copy. host_view_type h_a = Kokkos::create_mirror_view (a); // The View h_a lives in host (CPU) memory, so it's legal to fill // the view sequentially using ordinary code, like this. for (int i = 0; i < 10; i++) { for (int j = 0; j < 3; j++) { h_a(i,j) = i*10 + j; } } Kokkos::deep_copy (a, h_a); // Copy from host to device. int sum = 0; Kokkos::parallel_reduce (10, ReduceFunctor (a), sum); printf ("Result is %i\n",sum); } Kokkos::finalize (); }
/** * seed generator with 32-bit integer */ void seed(unsigned int value) { // compute leapfrog multipliers for initialization cuda::vector<uint48> g_A(dim.threads()), g_C(dim.threads()); cuda::configure(dim.grid, dim.block); get_rand48_kernel().leapfrog(g_A); // compute leapfrog addends for initialization cuda::copy(g_A, g_C); algorithm::gpu::scan<uint48> scan(g_C.size(), dim.threads_per_block()); scan(g_C); // initialize generator with seed cuda::vector<uint48> g_a(1), g_c(1); cuda::host::vector<uint48> h_a(1), h_c(1); cuda::configure(dim.grid, dim.block); get_rand48_kernel().seed(g_A, g_C, g_a, g_c, g_state_, value); cuda::copy(g_a, h_a); cuda::copy(g_c, h_c); // set leapfrog constants for constant device memory rng_.a = h_a.front(); rng_.c = h_c.front(); rng_.g_state = g_state_.data(); }
int do_memory_uplo(int n, W& workspace ) { typedef typename bindings::remove_imaginary<T>::type real_type ; typedef ublas::matrix<T, ublas::column_major> matrix_type ; typedef ublas::vector<real_type> vector_type ; typedef ublas::hermitian_adaptor<matrix_type, UPLO> hermitian_type; // Set matrix matrix_type a( n, n ); a.clear(); vector_type e1( n ); vector_type e2( n ); fill( a ); matrix_type a2( a ); matrix_type z( a ); // Compute Schur decomposition. fortran_int_t m; ublas::vector<fortran_int_t> ifail(n); hermitian_type h_a( a ); lapack::heevx( 'V', 'A', h_a, real_type(0.0), real_type(1.0), 2, n-1, real_type(1e-28), m, e1, z, ifail, workspace ) ; if (check_residual( a2, e1, z )) return 255 ; hermitian_type h_a2( a2 ); lapack::heevx( 'N', 'A', h_a2, real_type(0.0), real_type(1.0), 2, n-1, real_type(1e-28), m, e2, z, ifail, workspace ) ; if (norm_2( e1 - e2 ) > n * norm_2( e1 ) * std::numeric_limits< real_type >::epsilon()) return 255 ; // Test for a matrix range fill( a ); a2.assign( a ); typedef ublas::matrix_range< matrix_type > matrix_range ; typedef ublas::hermitian_adaptor<matrix_range, UPLO> hermitian_range_type; ublas::range r(1,n-1) ; matrix_range a_r( a, r, r ); matrix_range z_r( z, r, r ); ublas::vector_range< vector_type> e_r( e1, r ); ublas::vector<fortran_int_t> ifail_r(n-2); hermitian_range_type h_a_r( a_r ); lapack::heevx( 'V', 'A', h_a_r, real_type(0.0), real_type(1.0), 2, n-1, real_type(1e-28), m, e_r, z_r, ifail_r, workspace ); matrix_range a2_r( a2, r, r ); if (check_residual( a2_r, e_r, z_r )) return 255 ; return 0 ; } // do_memory_uplo()
// Fill a managed MP::Vector view, then wrap the same allocation in an
// unmanaged view and verify the contents are visible through it.
TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL( Kokkos_View_MP, Unmanaged, Storage, Layout )
{
  typedef typename Storage::device_type Device;
  typedef typename Storage::value_type Scalar;
  typedef Sacado::MP::Vector<Storage> Vector;
  typedef typename ApplyView<Vector*,Layout,Device>::type ViewType;
  typedef typename ViewType::size_type size_type;
  typedef typename ViewType::HostMirror host_view_type;
  typedef typename host_view_type::array_type host_array_type;

  const size_type num_rows = global_num_rows;
  const size_type num_cols =
    Storage::is_static ? Storage::static_size : global_num_cols;
  ViewType v("view", num_rows, num_cols);
  host_view_type h_v = Kokkos::create_mirror_view(v);
  host_array_type h_a = h_v;

  // Non-contiguous views are addressed row-major regardless of layout.
  const bool row_major_fill =
    Kokkos::Impl::is_same< typename ViewType::array_layout,
                           Kokkos::LayoutRight >::value ||
    !ViewType::is_contiguous;
  for (size_type row=0; row<num_rows; ++row) {
    for (size_type col=0; col<num_cols; ++col) {
      const Scalar val = generate_vector_coefficient<Scalar>(
        num_rows, num_cols, row, col);
      if (row_major_fill)
        h_a(row,col) = val;
      else
        h_a(col,row) = val;
    }
  }
  Kokkos::deep_copy(v, h_v);

  // Create unmanaged view over the same device allocation.
  ViewType v2(Kokkos::view_without_managing, v.ptr_on_device(),
              num_rows, num_cols);

  success = checkVectorView(v2, out);
}
// Fill a view through its host mirror, reduce on the device with the
// squaresum functor, and print the result.
int main()
{
  Kokkos::initialize();

  view_type a("A",10);
  host_view_type h_a = Kokkos::create_mirror_view(a);

  // Populate on the host, then push the data to the device.
  for (int row = 0; row < 10; ++row)
    for (int col = 0; col < 3; ++col)
      h_a(row,col) = row*10 + col;
  Kokkos::deep_copy(a,h_a);

  int sum = 0;
  Kokkos::parallel_reduce(10,squaresum(a),sum);
  printf("Result is %i\n",sum);

  Kokkos::finalize();
}
int main(void) { std::vector<float> h_a(LENGTH); // a vector std::vector<float> h_b(LENGTH); // b vector std::vector<float> h_c (LENGTH, 0xdeadbeef); // c = a + b, from compute device cl::Buffer d_a; // device memory used for the input a vector cl::Buffer d_b; // device memory used for the input b vector cl::Buffer d_c; // device memory used for the output c vector // Fill vectors a and b with random float values int count = LENGTH; for(int i = 0; i < count; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; } try { // Create a context cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context cl::Program program(context, util::loadProgram("vadd.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto vadd = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int>(program, "vadd"); d_a = cl::Buffer(context, begin(h_a), end(h_a), true); d_b = cl::Buffer(context, begin(h_b), end(h_b), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); util::Timer timer; vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_a, d_b, d_c, count); queue.finish(); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; printf("\nThe kernels ran in %lf seconds\n", rtime); cl::copy(queue, d_c, begin(h_c), end(h_c)); // Test the results int correct = 0; float tmp; for(int i = 0; i < count; i++) { tmp = h_a[i] + h_b[i]; // expected value for d_c[i] tmp -= h_c[i]; // compute errors if(tmp*tmp < TOL*TOL) { // correct if square deviation is less correct++; // than tolerance squared } else { printf( " tmp %f h_a %f h_b %f h_c %f \n", tmp, h_a[i], h_b[i], h_c[i]); } } // summarize results printf( "vector add to find C = A+B: %d out of %d results were correct.\n", correct, count); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } }
int main(void) { std::vector<float> h_a(LENGTH); // a vector std::vector<float> h_b(LENGTH); // b vector std::vector<float> h_c (LENGTH, 0xdeadbeef); // c vector (result) std::vector<float> h_d (LENGTH, 0xdeadbeef); // d vector (result) std::vector<float> h_e (LENGTH); // e vector std::vector<float> h_f (LENGTH, 0xdeadbeef); // f vector (result) std::vector<float> h_g (LENGTH); // g vector cl::Buffer d_a; // device memory used for the input a vector cl::Buffer d_b; // device memory used for the input b vector cl::Buffer d_c; // device memory used for the output c vector cl::Buffer d_d; // device memory used for the output d vector cl::Buffer d_e; // device memory used for the input e vector cl::Buffer d_f; // device memory used for the output f vector cl::Buffer d_g; // device memory used for the input g vector // Fill vectors a and b with random float values int count = LENGTH; for(int i = 0; i < count; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_e[i] = rand() / (float)RAND_MAX; h_g[i] = rand() / (float)RAND_MAX; } try { // Create a context cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context cl::Program program(context, util::loadProgram("vadd.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto vadd = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer>(program, "vadd"); d_a = cl::Buffer(context, begin(h_a), end(h_a), true); d_b = cl::Buffer(context, begin(h_b), end(h_b), true); d_e = cl::Buffer(context, begin(h_e), end(h_e), true); d_g = cl::Buffer(context, begin(h_g), end(h_g), true); d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(float) * LENGTH); d_d = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(float) * LENGTH); d_f = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_a, d_b, d_c); vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_e, d_c, d_d); vadd( 
cl::EnqueueArgs( queue, cl::NDRange(count)), d_g, d_d, d_f); cl::copy(queue, d_f, begin(h_f), end(h_f)); // Test the results int correct = 0; float tmp; for(int i = 0; i < count; i++) { tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i]; // assign element i of a+b+e+g to tmp tmp -= h_f[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else { printf(" tmp %f h_a %f h_b %f h_e %f h_g %f h_f %f\n",tmp, h_a[i], h_b[i], h_e[i], h_g[i], h_f[i]); } } // summarize results printf("C = A+B+E+G: %d out of %d results were correct.\n", correct, count); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << std::endl; } }
// Build the multi-component storage path template for the object in ctx:
// hash the object id, derive pod/cap/scatter/start_block from the hash,
// and render repo->host (a printf-style format) into ctx->path_template
// with the flattened object id appended.  Returns 0.
int mc_update_path(DAL_Context* ctx)
{
   // QUESTION: Do we need to prepend the bucket and ns->alias to the
   //           objid? For now We can just flatten the url to make
   //           things easy.

   // shorthand
   PathInfo* info = &(MC_FH(ctx)->info);
   const MarFS_Repo* repo = info->pre.repo;
   char* objid = info->pre.objid;
   char* path_template = MC_CONTEXT(ctx)->path_template;

   char obj_filename[MARFS_MAX_OBJID_SIZE];
   strncpy(obj_filename, objid, MARFS_MAX_OBJID_SIZE);
   // strncpy() does not null-terminate when the source fills the buffer;
   // terminate explicitly before treating obj_filename as a C string.
   obj_filename[MARFS_MAX_OBJID_SIZE - 1] = '\0';
   flatten_objid(obj_filename);

   // We will use the hash in multiple places, save it to avoid
   // recomputing.
   //
   // Hash the actual object ID so the hash will remain the same,
   // regadless of changes to the "file-ification" format.
   unsigned long objid_hash = polyhash(objid);

   char *mc_path_format = repo->host;

   unsigned int num_blocks = MC_CONFIG(ctx)->n + MC_CONFIG(ctx)->e;
   unsigned int num_pods = MC_CONFIG(ctx)->num_pods;
   unsigned int num_cap = MC_CONFIG(ctx)->num_cap;
   unsigned int scatter_width = MC_CONFIG(ctx)->scatter_width;

   unsigned int seed = objid_hash;
   uint64_t a[3];
   int i;
   for(i = 0; i < 3; i++)
      // Widen before multiplying: `rand_r(&seed) * 2` in int arithmetic
      // can overflow (UB).  The +1 keeps the multiplier odd, as the
      // h_a() hash requires.
      a[i] = (uint64_t)rand_r(&seed) * 2 + 1;

   MC_CONTEXT(ctx)->pod = objid_hash % num_pods;
   MC_CONTEXT(ctx)->cap = h_a(objid_hash, a[0]) % num_cap;
   unsigned long scatter = h_a(objid_hash, a[1]) % scatter_width;
   MC_CONTEXT(ctx)->start_block = h_a(objid_hash, a[2]) % num_blocks;

   // fill in path template
   // the mc_path_format is sometheing like:
   //   "<protected-root>/repo10+2/pod%d/block%s/cap%d/scatter%d/"
   snprintf(path_template, MC_MAX_PATH_LEN, mc_path_format,
            MC_CONTEXT(ctx)->pod,
            "%d", // this will be filled in by the ec library
            MC_CONTEXT(ctx)->cap,
            scatter);

   // be robust to vairation in the config... We could always just add
   // a slash, but that will get ugly in the logs.
   if(path_template[strlen(path_template) - 1] != '/')
      strcat(path_template, "/");

   // append the fileified object id.  Bound the append by the space
   // remaining in path_template: the previous bound
   // (MARFS_MAX_OBJID_SIZE) measured the *source* and could overflow
   // the MC_MAX_PATH_LEN destination.
   strncat(path_template, obj_filename,
           MC_MAX_PATH_LEN - strlen(path_template) - 1);

   LOG(LOG_INFO, "MC path template: (starting block: %d) %s\n",
       MC_CONTEXT(ctx)->start_block, path_template);

   return 0;
}