Exemple #1
* In this tutorial we do not need additional auxiliary functions, allowing us to start right with main():
int main()
  //Change this type definition to double if your gpu supports that
  typedef float       ScalarType;

  * <h2> Scalar Operations </h2>
  * Although usually not very efficient because of PCI-Express latency, ViennaCL enables you to directly manipulate individual scalar values.
  * As such, a viennacl::scalar<double> behaves very similarly to a normal `double`.
  * Let us define a few CPU and ViennaCL scalars:
  ScalarType s1 = ScalarType(3.1415926);   //note: writing ScalarType s1 = 3.1415926; leads to warnings with some compilers if ScalarType is 'float'.
  ScalarType s2 = ScalarType(2.71763);
  ScalarType s3 = ScalarType(42.0);

  viennacl::scalar<ScalarType> vcl_s1;
  viennacl::scalar<ScalarType> vcl_s2 = ScalarType(1.0);
  viennacl::scalar<ScalarType> vcl_s3 = ScalarType(1.0);

  * CPU scalars can be transparently assigned to GPU scalars and vice versa:
  std::cout << "Copying a few scalars..." << std::endl;
  vcl_s1 = s1;
  s2 = vcl_s2;
  vcl_s3 = s3;

  * Operations between GPU scalars work just as for CPU scalars:
  * (Note that such single compute kernels on the GPU are considerably slower than on the CPU)

  std::cout << "Manipulating a few scalars..." << std::endl;
  std::cout << "operator +=" << std::endl;
  s1 += s2;
  vcl_s1 += vcl_s2;

  std::cout << "operator *=" << std::endl;
  s1 *= s2;
  vcl_s1 *= vcl_s2;

  std::cout << "operator -=" << std::endl;
  s1 -= s2;
  vcl_s1 -= vcl_s2;

  std::cout << "operator /=" << std::endl;
  s1 /= s2;
  vcl_s1 /= vcl_s2;

  std::cout << "operator +" << std::endl;
  s1 = s2 + s3;
  vcl_s1 = vcl_s2 + vcl_s3;

  std::cout << "multiple operators" << std::endl;
  s1 = s2 + s3 * s2 - s3 / s1;
  vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;

  * Operations can also be mixed:
  std::cout << "mixed operations" << std::endl;
  vcl_s1 = s1 * vcl_s2 + s3 - vcl_s3;

  * The output stream is overloaded as well for direct printing to e.g. a terminal:

  std::cout << "CPU scalar s3: " << s3 << std::endl;
  std::cout << "GPU scalar vcl_s3: " << vcl_s3 << std::endl;

  *  <h2>Vector Operations</h2>
  * Define a few vectors (from STL and plain C) and viennacl::vectors
  std::vector<ScalarType>      std_vec1(10);
  std::vector<ScalarType>      std_vec2(10);
  ScalarType                   plain_vec3[10];  //plain C array

  viennacl::vector<ScalarType> vcl_vec1(10);
  viennacl::vector<ScalarType> vcl_vec2(10);
  viennacl::vector<ScalarType> vcl_vec3(10);

  * Let us fill the CPU vectors with random values:
  * (random<> is a helper function from Random.hpp)

  for (unsigned int i = 0; i < 10; ++i)
    std_vec1[i] = random<ScalarType>();
    vcl_vec2(i) = random<ScalarType>();  //also works for GPU vectors, but is MUCH slower (approx. factor 10.000) than the CPU analogue
    plain_vec3[i] = random<ScalarType>();

  * Copy the CPU vectors to the GPU vectors and vice versa
  viennacl::copy(std_vec1.begin(), std_vec1.end(), vcl_vec1.begin()); //either the STL way
  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), std_vec2.begin()); //either the STL way
  viennacl::copy(vcl_vec2, std_vec2);                                 //using the short hand notation for objects that provide .begin() and .end() members
  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), plain_vec3);       //copy to plain C vector

  * Also partial copies by providing the corresponding iterators are possible:
  viennacl::copy(std_vec1.begin() + 4, std_vec1.begin() + 8, vcl_vec1.begin() + 4);   //cpu to gpu
  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, vcl_vec2.begin() + 1);   //gpu to gpu
  viennacl::copy(vcl_vec1.begin() + 4, vcl_vec1.begin() + 8, std_vec1.begin() + 1);   //gpu to cpu

  * Compute the inner product of two GPU vectors and write the result to either CPU or GPU
  vcl_s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
  s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
  s2 = viennacl::linalg::inner_prod(std_vec1, std_vec2); //inner prod can also be used with std::vector (computations are carried out on CPU then)

  * Compute norms:
  s1 = viennacl::linalg::norm_1(vcl_vec1);
  vcl_s2 = viennacl::linalg::norm_2(vcl_vec2);
  s3 = viennacl::linalg::norm_inf(vcl_vec3);

  * Plane rotation of two vectors:
  viennacl::linalg::plane_rotation(vcl_vec1, vcl_vec2, 1.1f, 2.3f);

  * Use viennacl::vector via the overloaded operators just as you would write it on paper:

  //simple expression:
  vcl_vec1 = vcl_s1 * vcl_vec2 / vcl_s3;

  //more complicated expression:
  vcl_vec1 = vcl_vec2 / vcl_s3 + vcl_s2 * (vcl_vec1 - vcl_s2 * vcl_vec2);

  * Swap the content of two vectors without a temporary vector:
  viennacl::swap(vcl_vec1, vcl_vec2);  //swaps all entries in memory
  viennacl::fast_swap(vcl_vec1, vcl_vec2); //swaps OpenCL memory handles only

  * The vectors can also be cleared directly:

  *  That's it, the tutorial is completed.
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return EXIT_SUCCESS;
Exemple #2
// Runs the tuning benchmark for the vector kernels.
// For each kernel listed below, the kernel name is stored in 'config' and optimize_full() or
// optimize_restricted() is called with the matching vector_* functor, the parameter database
// 'paras', and the per-kernel entry of the local 'all_timings' map.
// Always returns 0.
// NOTE(review): vcl_vec1, vcl_vec2 and vcl_vec3 are used below but are not declared anywhere in
// this excerpt, and the function's braces are absent - lines appear to be missing; restore them
// from the original file before building.
int run_vector_benchmark(test_config & config, viennacl::io::parameter_database& paras)
  typedef viennacl::scalar<ScalarType>   VCLScalar;
  typedef viennacl::vector<ScalarType>   VCLVector;
  //set up a little bit of data to play with:
  //ScalarType std_result = 0;
  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
  // NOTE(review): vcl_factor2 is not referenced again in the visible code.
  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
  std::vector<ScalarType> std_vec1(BENCHMARK_VECTOR_SIZE);  //used to set all values to zero

  viennacl::copy(std_vec1, vcl_vec1); //initialize vectors with all zeros (no need to worry about overflows then)
  viennacl::copy(std_vec1, vcl_vec2); //initialize vectors with all zeros (no need to worry about overflows then)
  // Bundle the scalar and the three vectors into the object handed to every benchmark functor:
  typedef test_data<VCLScalar, VCLVector>   TestDataType;
  test_data<VCLScalar, VCLVector> data(vcl_factor1, vcl_vec1, vcl_vec2, vcl_vec3);

  ///////////// Start parameter recording  /////////////////
  // all_timings is keyed by kernel name; each entry maps a double to a pair of unsigned ints
  // (presumably execution time -> tuning parameters; confirm against optimize_full()).
  typedef std::map< double, std::pair<unsigned int, unsigned int> >   TimingType;
  std::map< std::string, TimingType > all_timings;
  // vector addition
  std::cout << "------- Related to vector addition ----------" << std::endl;
  config.kernel_name("add");                    optimize_full(paras, all_timings[config.kernel_name()], vector_add<TestDataType>, config, data);
  config.kernel_name("inplace_add");            optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_add<TestDataType>, config, data);
  config.kernel_name("mul_add");                optimize_full(paras, all_timings[config.kernel_name()], vector_mul_add<TestDataType>, config, data);
  config.kernel_name("cpu_mul_add");            optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_mul_add<TestDataType>, config, data);
  config.kernel_name("inplace_mul_add");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_mul_add<TestDataType>, config, data);
  config.kernel_name("cpu_inplace_mul_add");    optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_inplace_mul_add<TestDataType>, config, data);
  config.kernel_name("inplace_div_add");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_div_add<TestDataType>, config, data);

  // vector subtraction
  std::cout << "------- Related to vector subtraction ----------" << std::endl;
  config.kernel_name("sub");                    optimize_full(paras, all_timings[config.kernel_name()], vector_sub<TestDataType>, config, data);
  config.kernel_name("inplace_sub");            optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_sub<TestDataType>, config, data);
  config.kernel_name("mul_sub");                optimize_full(paras, all_timings[config.kernel_name()], vector_mul_sub<TestDataType>, config, data);
  config.kernel_name("inplace_mul_sub");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_mul_sub<TestDataType>, config, data);
  config.kernel_name("inplace_div_sub");        optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_div_sub<TestDataType>, config, data);

  // scaling by a scalar (GPU and CPU scalar variants)
  std::cout << "------- Related to vector scaling (mult/div) ----------" << std::endl;
  config.kernel_name("mult");                   optimize_full(paras, all_timings[config.kernel_name()], vector_mult<TestDataType>, config, data);
  config.kernel_name("inplace_mult");           optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_mult<TestDataType>, config, data);
  config.kernel_name("cpu_mult");               optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_mult<TestDataType>, config, data);
  config.kernel_name("cpu_inplace_mult");       optimize_full(paras, all_timings[config.kernel_name()], vector_cpu_inplace_mult<TestDataType>, config, data);
  config.kernel_name("divide");                 optimize_full(paras, all_timings[config.kernel_name()], vector_divide<TestDataType>, config, data);
  config.kernel_name("inplace_divide");         optimize_full(paras, all_timings[config.kernel_name()], vector_inplace_divide<TestDataType>, config, data);
  std::cout << "------- Others ----------" << std::endl;
  config.kernel_name("inner_prod");             optimize_full(paras, all_timings[config.kernel_name()], vector_inner_prod<TestDataType>, config, data);
  config.kernel_name("swap");                   optimize_full(paras, all_timings[config.kernel_name()], vector_swap<TestDataType>, config, data);
  config.kernel_name("clear");                  optimize_full(paras, all_timings[config.kernel_name()], vector_clear<TestDataType>, config, data);
  config.kernel_name("plane_rotation");         optimize_full(paras, all_timings[config.kernel_name()], vector_plane_rotation<TestDataType>, config, data);
  //config.max_work_groups(32); //otherwise failures on 8500 GT
  // The norm kernels below go through optimize_restricted() instead of optimize_full():
  config.kernel_name("norm_1");                 optimize_restricted(paras, all_timings[config.kernel_name()], vector_norm_1<TestDataType>, config, data);
  config.kernel_name("norm_2");                 optimize_restricted(paras, all_timings[config.kernel_name()], vector_norm_2<TestDataType>, config, data);
  config.kernel_name("norm_inf");               optimize_restricted(paras, all_timings[config.kernel_name()], vector_norm_inf<TestDataType>, config, data);

  //restricted optimizations:
  config.kernel_name("index_norm_inf");         optimize_restricted(paras, all_timings[config.kernel_name()], vector_index_norm_inf<TestDataType>, config, data);
  return 0;
Exemple #3
// Runs the tuning benchmark for the dense-matrix kernels (viennacl::matrix).
// Sets up two vectors and one matrix, wraps them in a test_data object and calls
// optimize_restricted() once per matrix_* functor. Always returns 0.
// NOTE(review): vcl_vec1, vcl_vec2 and vcl_mat are used below but not declared in this excerpt,
// and the function's braces are absent - lines appear to be missing from the original file.
int run_matrix_benchmark(test_config & config, viennacl::io::parameter_database& paras)
  typedef viennacl::scalar<ScalarType>   VCLScalar;
  typedef viennacl::vector<ScalarType>   VCLVector;
  typedef viennacl::matrix<ScalarType>   VCLMatrix;
  //set up a little bit of data to play with:
  //ScalarType std_result = 0;
  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
  // NOTE(review): vcl_factor2 is not referenced again in the visible code.
  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
  std::vector<ScalarType> std_vec1(BENCHMARK_MATRIX_SIZE);  //used to set all values to zero
  std::vector< std::vector<ScalarType> > stl_mat(BENCHMARK_MATRIX_SIZE);  //store identity matrix here
  // NOTE(review): only the outer vector is sized above; as shown, every row is still empty when
  // stl_mat[i][i] is written, which is an out-of-bounds access. A per-row resize to
  // BENCHMARK_MATRIX_SIZE was presumably lost from this excerpt - verify against the original.
  for (int i=0; i<BENCHMARK_MATRIX_SIZE; ++i)
    stl_mat[i][i] = 1.0;

  copy(std_vec1, vcl_vec1); //initialize vectors with all zeros (no need to worry about overflows then)
  copy(std_vec1, vcl_vec2); //initialize vectors with all zeros (no need to worry about overflows then)
  copy(stl_mat, vcl_mat);   //transfer the host matrix to the ViennaCL matrix
  // Bundle scalar, vectors and matrix into the object handed to every benchmark functor:
  typedef test_data<VCLScalar, VCLVector, VCLMatrix>   TestDataType;
  test_data<VCLScalar, VCLVector, VCLMatrix> data(vcl_factor1, vcl_vec1, vcl_vec2, vcl_mat);

  ///////////// Start parameter recording  /////////////////
  typedef std::map< double, std::pair<unsigned int, unsigned int> >   TimingType;
  std::map< std::string, TimingType > all_timings;
  // NOTE(review): config.kernel_name() is never changed between the calls below in this excerpt,
  // so every call indexes all_timings with the same key. The per-kernel
  // config.kernel_name("...") statements seem to be missing - confirm against the original file.
  std::cout << "------- Related to direct solution algorithms ----------" << std::endl;
  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_trans_lower_triangular_substitute_inplace<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_lower_triangular_substitute_inplace<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_unit_lower_triangular_substitute_inplace<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_upper_triangular_substitute_inplace<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_trans_upper_triangular_substitute_inplace<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_unit_upper_triangular_substitute_inplace<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_lu_factorize<TestDataType>, config, data);

  //other kernels:
  std::cout << "------- Related to other operations ----------" << std::endl;
  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_rank1_update<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_scaled_rank1_update<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_vec_mul<TestDataType>, config, data);

  optimize_restricted(paras, all_timings[config.kernel_name()],
                      matrix_trans_vec_mul<TestDataType>, config, data);

  return 0;
// Runs the tuning benchmark for the sparse matrix-vector product (viennacl::compressed_matrix).
// Builds a sparse host matrix row by row, copies it and two zero vectors to ViennaCL objects,
// and calls optimize_full() with the matrix_vec_mul functor. Always returns 0.
// NOTE(review): vcl_vec1, vcl_vec2 and vcl_mat are used below but not declared in this excerpt,
// and all braces are absent - lines appear to be missing from the original file.
int run_matrix_benchmark(test_config & config, viennacl::io::parameter_database& paras)
  typedef viennacl::scalar<ScalarType>   VCLScalar;
  typedef viennacl::vector<ScalarType>   VCLVector;
  typedef viennacl::compressed_matrix<ScalarType>   VCLMatrix;
  //set up a little bit of data to play with:
  //ScalarType std_result = 0;
  ScalarType std_factor1 = static_cast<ScalarType>(3.1415);
  ScalarType std_factor2 = static_cast<ScalarType>(42.0);
  viennacl::scalar<ScalarType> vcl_factor1(std_factor1);
  // NOTE(review): vcl_factor2 is not referenced again in the visible code.
  viennacl::scalar<ScalarType> vcl_factor2(std_factor2);
  std::vector<ScalarType> std_vec1(BENCHMARK_MATRIX_SIZE);  //used to set all values to zero
  std::vector< std::map< unsigned int, ScalarType> > stl_mat(BENCHMARK_MATRIX_SIZE);  //one map per row (column index -> value); filled with a banded pattern below, not an identity matrix
  // Every row gets a 1 on the diagonal. Judging by the indentation (the braces are missing), rows
  // with i > 10 additionally get entries at column offsets -10, -7, -4, -2, and rows with
  // i + 10 < BENCHMARK_MATRIX_SIZE get entries at offsets +5, +7, +9, +10 - confirm the grouping
  // against the original file.
  for (int i=0; i<BENCHMARK_MATRIX_SIZE; ++i)
      if (i > 10)
          stl_mat[i][i - 10] = 1.0;
          stl_mat[i][i - 7] = 1.0;
          stl_mat[i][i - 4] = 1.0;
          stl_mat[i][i - 2] = 1.0;
      stl_mat[i][i] = 1.0;
      if (i + 10 < BENCHMARK_MATRIX_SIZE)
          stl_mat[i][i + 5] = 1.0;
          stl_mat[i][i + 7] = 1.0;
          stl_mat[i][i + 9] = 1.0;
          stl_mat[i][i + 10] = 1.0;

  viennacl::copy(std_vec1, vcl_vec1); //initialize vectors with all zeros (no need to worry about overflows then)
  viennacl::copy(std_vec1, vcl_vec2); //initialize vectors with all zeros (no need to worry about overflows then)
  viennacl::copy(stl_mat, vcl_mat);   //transfer the sparse host matrix to the ViennaCL matrix
  // Bundle scalar, vectors and matrix into the object handed to the benchmark functor:
  typedef test_data<VCLScalar, VCLVector, VCLMatrix>   TestDataType;
  test_data<VCLScalar, VCLVector, VCLMatrix> data(vcl_factor1, vcl_vec1, vcl_vec2, vcl_mat);

  ///////////// Start parameter recording  /////////////////
  typedef std::map< double, std::pair<unsigned int, unsigned int> >   TimingType;
  std::map< std::string, TimingType > all_timings;

  //other kernels:
  std::cout << "------- Related to other operations ----------" << std::endl;

  // NOTE(review): config.kernel_name() is used as the map key but is not set in the visible code.
  optimize_full(paras, all_timings[config.kernel_name()],
                      matrix_vec_mul<TestDataType>, config, data);

  return 0;
// Scheduler tutorial: builds the statement vcl_vec3 = vcl_vec1 + vcl_vec2 by hand as a
// two-node expression tree, wraps it in a viennacl::scheduler::statement, prints the
// statement and then prints the three vectors. Returns EXIT_SUCCESS.
// NOTE(review): the function's braces are absent in this excerpt.
int main()
  typedef float       ScalarType;

  viennacl::vector<ScalarType> vcl_vec1(10);
  viennacl::vector<ScalarType> vcl_vec2(10);
  viennacl::vector<ScalarType> vcl_vec3(10);

  // Fill the two ViennaCL input vectors with known values:
  // vcl_vec1[i] = i and vcl_vec2[i] = 10 - i (no random data is used here).

  for (unsigned int i = 0; i < 10; ++i)
    vcl_vec1[i] = ScalarType(i);
    vcl_vec2[i] = ScalarType(10 - i);

  // Build expression graph for the operation vcl_vec3 = vcl_vec1 + vcl_vec2
  // This requires the following expression graph:
  //             ( = )
  //            /      |
  //    vcl_vec3      ( + )
  //                 /     |
  //           vcl_vec1    vcl_vec2
  // One expression node consists of two leaves and the operation connecting the two.
  // Here we thus need two nodes: One for {vcl_vec3, = , link}, where 'link' points to the second node
  // {vcl_vec1, +, vcl_vec2}.
  // The following is the lowest level on which one could build the expression tree.
  // Even for a C API one would introduce some additional convenience layer such as add_vector_float_to_lhs(...); etc.
  typedef viennacl::scheduler::statement::container_type   NodeContainerType;   // this is just std::vector<viennacl::scheduler::statement_node>
  NodeContainerType expression_nodes(2);                                        //container with two nodes

  ////// First node //////

  // specify LHS of first node, i.e. vcl_vec3:
  expression_nodes[0].lhs.type_family  = viennacl::scheduler::VECTOR_TYPE_FAMILY;   // family of vectors
  expression_nodes[0].lhs.subtype      = viennacl::scheduler::DENSE_VECTOR_TYPE;    // a dense vector
  expression_nodes[0].lhs.numeric_type = viennacl::scheduler::FLOAT_TYPE;           // vector consisting of floats
  expression_nodes[0].lhs.vector_float = &vcl_vec3;                                 // provide pointer to vcl_vec3;

  // specify assignment operation for this node:
  expression_nodes[0].op.type_family   = viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY; // this is a binary operation, so both LHS and RHS operands are important
  expression_nodes[0].op.type          = viennacl::scheduler::OPERATION_BINARY_ASSIGN_TYPE; // assignment operation: '='

  // specify RHS: Just refer to the second node:
  expression_nodes[0].rhs.type_family  = viennacl::scheduler::COMPOSITE_OPERATION_FAMILY; // this links to another node (no need to set .subtype and .numeric_type)
  expression_nodes[0].rhs.node_index   = 1;                                               // index of the other node

  ////// Second node //////

  // LHS
  expression_nodes[1].lhs.type_family  = viennacl::scheduler::VECTOR_TYPE_FAMILY;   // family of vectors
  expression_nodes[1].lhs.subtype      = viennacl::scheduler::DENSE_VECTOR_TYPE;    // a dense vector
  expression_nodes[1].lhs.numeric_type = viennacl::scheduler::FLOAT_TYPE;           // vector consisting of floats
  expression_nodes[1].lhs.vector_float = &vcl_vec1;                                 // provide pointer to vcl_vec1

  // OP
  expression_nodes[1].op.type_family   = viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY; // this is a binary operation, so both LHS and RHS operands are important
  expression_nodes[1].op.type          = viennacl::scheduler::OPERATION_BINARY_ADD_TYPE;    // addition operation: '+'

  // RHS
  expression_nodes[1].rhs.type_family  = viennacl::scheduler::VECTOR_TYPE_FAMILY;  // family of vectors
  expression_nodes[1].rhs.subtype      = viennacl::scheduler::DENSE_VECTOR_TYPE;   // a dense vector
  expression_nodes[1].rhs.numeric_type = viennacl::scheduler::FLOAT_TYPE;          // vector consisting of floats
  expression_nodes[1].rhs.vector_float = &vcl_vec2;                                // provide pointer to vcl_vec2

  // create the full statement (aka. single line of code such as vcl_vec3 = vcl_vec1 + vcl_vec2):
  viennacl::scheduler::statement vec_addition(expression_nodes);

  // print it
  std::cout << vec_addition << std::endl;

  // run it
  // NOTE(review): no statement follows this step in the excerpt, so as shown the statement is
  // never executed and vcl_vec3 is printed without having been assigned. The call that runs
  // 'vec_addition' appears to be missing - restore it from the original file.

  // print vectors
  std::cout << "vcl_vec1: " << vcl_vec1 << std::endl;
  std::cout << "vcl_vec2: " << vcl_vec2 << std::endl;
  std::cout << "vcl_vec3: " << vcl_vec3 << std::endl;

  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return EXIT_SUCCESS;
* With this let us go right to main():
int main()
  typedef float       ScalarType;

  * <h2>Part 1: Set up a custom context</h2>
  * The following is rather lengthy because OpenCL is a fairly low-level framework.
  * For comparison, the subsequent code explicitly performs the OpenCL setup that is done
  * in the background within the 'custom_kernels'-tutorial

  //manually set up a custom OpenCL context:
  std::vector<cl_device_id> device_id_array;

  //get all available devices
  viennacl::ocl::platform pf;
  std::cout << "Platform info: " << pf.info() << std::endl;
  std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT);
  std::cout << devices[0].name() << std::endl;
  std::cout << "Number of devices for custom context: " << devices.size() << std::endl;

  //set up context using all found devices:
  for (std::size_t i=0; i<devices.size(); ++i)

  std::cout << "Creating context..." << std::endl;
  cl_int err;
  cl_context my_context = clCreateContext(0, cl_uint(device_id_array.size()), &(device_id_array[0]), NULL, NULL, &err);

  //create two Vectors:
  unsigned int vector_size = 10;
  std::vector<ScalarType> vec1(vector_size);
  std::vector<ScalarType> vec2(vector_size);
  std::vector<ScalarType> result(vector_size);

  // fill the operands vec1 and vec2:
  for (unsigned int i=0; i<vector_size; ++i)
    vec1[i] = static_cast<ScalarType>(i);
    vec2[i] = static_cast<ScalarType>(vector_size-i);

  // create memory in OpenCL context:
  cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err);
  cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err);
  cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err);

  // create a command queue for each device:

  std::vector<cl_command_queue> queues(devices.size());
  for (std::size_t i=0; i<devices.size(); ++i)
    queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err);

  // create and build a program in the context:
  std::size_t source_len = std::string(my_compute_program).length();
  cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err);
  err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL);

/*            char buffer[1024];
            cl_build_status status;
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
            clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL);
            std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl;
            std::cout << "Log: " << buffer << std::endl;*/


  // create a kernel from the program:
  const char * kernel_name = "elementwise_prod";
  cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err);

  // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2;
  err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1);
  err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2);
  err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result);
  err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size);
  std::size_t global_size = vector_size;
  std::size_t local_size = vector_size;
  err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);

  // Read and output result:
  err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL);
  err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL);

  std::cout << "vec1  : ";
  for (std::size_t i=0; i<vec1.size(); ++i)
    std::cout << vec1[i] << " ";
  std::cout << std::endl;

  std::cout << "vec2  : ";
  for (std::size_t i=0; i<vec2.size(); ++i)
    std::cout << vec2[i] << " ";
  std::cout << std::endl;

  std::cout << "result: ";
  for (std::size_t i=0; i<result.size(); ++i)
    std::cout << result[i] << " ";
  std::cout << std::endl;

  * <h2>Part 2: Reuse Custom OpenCL Context with ViennaCL</h2>
  * To let ViennaCL reuse the previously created context, we need to make it known to ViennaCL \em before any ViennaCL objects are created.
  * We inject the custom context under the default id '0' via viennacl::ocl::setup_context() and then activate it with viennacl::ocl::switch_context().
  viennacl::ocl::setup_context(0, my_context, device_id_array, queues);
  viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero)

  * Check that ViennaCL really uses the new context:
  std::cout << "Existing context: " << my_context << std::endl;
  std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl;

  * Wrap existing OpenCL objects into ViennaCL:
  viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size);
  viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size);
  viennacl::vector<ScalarType> vcl_result(mem_result, vector_size);
  viennacl::scalar<ScalarType> vcl_s = 2.0;

  std::cout << "Standard vector operations within ViennaCL:" << std::endl;
  vcl_result = vcl_s * vcl_vec1 + vcl_vec2;

  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;

  * We can also reuse the existing elementwise_prod kernel.
  * Therefore, we first have to make the existing program known to ViennaCL
  * For more details on the three lines, see tutorial 'custom-kernels'
  std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl;
  viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program");
  viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel(my_kernel, "elementwise_prod");
  viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size())));  //Note that std::size_t might differ between host and device. Thus, a cast to cl_uint is necessary here.

  std::cout << "vec1  : ";
  std::cout << vcl_vec1 << std::endl;

  std::cout << "vec2  : ";
  std::cout << vcl_vec2 << std::endl;

  std::cout << "result: ";
  std::cout << vcl_result << std::endl;

  * Since a linear piece of memory can be interpreted in several ways,
  * we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1.
  * The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products:
  viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3);

  vcl_vec2.resize(3);   //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied)
  vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_result is now at a different memory location (values are copied)
  vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2);

  std::cout << "result of matrix-vector product: ";
  std::cout << vcl_result << std::endl;

  *  Any further operations can be carried out in the same way.
  *  Just keep in mind that any resizing of vectors or matrices leads to a reallocation of the underlying memory buffer, through which the 'wrapper' is lost.
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return EXIT_SUCCESS;
// Profiling driver: sets up ViennaCL context 0, then (inside a loop of three iterations) fills
// two host vectors of 1,000,000 entries, copies them to ViennaCL vectors, computes
// vcl_vec2 += vcl_s1 * vcl_vec1, copies the result back, and finally prints the first
// inclusive TAU counter value. Returns EXIT_SUCCESS.
// NOTE(review): the function's braces and the TAU start/stop calls are not visible in this
// excerpt; the structure below is incomplete as shown.
int main(int argc, char * argv[])
  // NOTE(review): no ';' follows TAU_INIT(...) here - confirm whether the macro supplies one.
  TAU_INIT(&argc, &argv)
  TAU_PROFILE_TIMER(orio_maintimer, "main()", "int (int, char **)", TAU_USER);
    std::vector<viennacl::ocl::device> devices = viennacl::ocl::platform().devices();
    // NOTE(review): nothing visible copies ids from 'devices' into 'my_devices', so as shown
    // an empty device list is passed to setup_context() - a fill loop seems to be missing.
    std::vector<cl_device_id> my_devices;
    viennacl::ocl::setup_context(0L, my_devices);

  //Change this type definition to double if your gpu supports that
  typedef double       ScalarType;
  ///////////// Vector operations /////////////////
  void * orio_profiler;
  TAU_PROFILER_CREATE(orio_profiler, "orio_generated_code", "", TAU_USER);

  // Repeat the measured section three times.
  // NOTE(review): the '{' opened here has no visible closing brace in this excerpt.
  for(int i = 0; i < 3; ++i) {

  // Define two host vectors (STL) and the matching viennacl::vectors
  std::vector<ScalarType>      std_vec1(1000000);
  std::vector<ScalarType>      std_vec2(1000000);

  viennacl::vector<ScalarType> vcl_vec1(1000000);
  viennacl::vector<ScalarType> vcl_vec2(1000000);
  viennacl::scalar<ScalarType> vcl_s1 = ScalarType(5.0);

  // Fill std_vec1 with random values and std_vec2 with zeros:
  // (random<> is a helper function from Random.hpp)
  // NOTE(review): this index 'i' shadows the outer loop's 'i'.
  for (unsigned int i = 0; i < 1000000; ++i)
    std_vec1[i] = random<ScalarType>(); 
    std_vec2[i] = 0.0;
  // Copy the CPU vectors to the GPU vectors
  viennacl::copy(std_vec1.begin(), std_vec1.end(), vcl_vec1.begin()); //either the STL way
  viennacl::copy(std_vec2.begin(), std_vec2.end(), vcl_vec2.begin()); //either the STL way

  // The operation under test: scaled vector addition on the ViennaCL vectors
  vcl_vec2 += vcl_s1 * vcl_vec1;

  // Copy the result back to the host
  viennacl::copy(vcl_vec2.begin(), vcl_vec2.end(), std_vec2.begin());

  // Query the profiler's inclusive counter values and print the first one
  double orio_inclusive[TAU_MAX_COUNTERS];
  TAU_PROFILER_GET_INCLUSIVE_VALUES(orio_profiler, &orio_inclusive);
  printf("{'/*@ coordinate @*/' : %g}\n", orio_inclusive[0]);
  return EXIT_SUCCESS;
Exemple #8
// Scalar/vector tutorial run on an accelerator-type OpenCL device: requests an accelerator
// device for context 0, exercises scalar arithmetic on host and ViennaCL scalars, lists all
// platform devices and the selected one, and then runs copies, inner products, norms, a plane
// rotation, vector expressions and a swap on small vectors. Returns 0.
// NOTE(review): the function's braces are absent in this excerpt.
int main()
  //Change this type definition to double if your gpu supports that
  typedef float       ScalarType;

  // Request an accelerator-type device (e.g. a Xeon Phi) for context 0; original note: "WORKS"
  viennacl::ocl::set_context_device_type(0, viennacl::ocl::accelerator_tag());

  ///////////// Scalar operations /////////////////

  // Define a few CPU scalars:
  ScalarType s1 = static_cast<ScalarType>(3.1415926);
  ScalarType s2 = static_cast<ScalarType>(2.71763);
  ScalarType s3 = static_cast<ScalarType>(42.0);

  // ViennaCL scalars are defined in the same way:
  viennacl::scalar<ScalarType> vcl_s1;
  viennacl::scalar<ScalarType> vcl_s2 = static_cast<ScalarType>(1.0);
  viennacl::scalar<ScalarType> vcl_s3 = static_cast<ScalarType>(1.0);

  // CPU scalars can be transparently assigned to GPU scalars and vice versa:
  vcl_s1 = s1;
  s2 = vcl_s2;
  vcl_s3 = s3;

  // Operations between GPU scalars work just as for CPU scalars:
  // (Note that such single compute kernels on the GPU are considerably slower than on the CPU)

  s1 += s2;
  vcl_s1 += vcl_s2;

  s1 *= s2;
  vcl_s1 *= vcl_s2;

  s1 -= s2;
  vcl_s1 -= vcl_s2;

  s1 /= s2;
  vcl_s1 /= vcl_s2;

  s1 = s2 + s3;
  vcl_s1 = vcl_s2 + vcl_s3;

  s1 = s2 + s3 * s2 - s3 / s1;
  vcl_s1 = vcl_s2 + vcl_s3 * vcl_s2 - vcl_s3 / vcl_s1;

  // Operations can also be mixed:

  vcl_s1 = s1 * vcl_s2 + s3 - vcl_s3;

  // Output stream is overloaded as well:

  std::cout << "CPU scalar s2: " << s2 << std::endl;
  std::cout << "GPU scalar vcl_s2: " << vcl_s2 << std::endl;

  // Print the info string of every device reported by the platform:
  std::vector< viennacl::ocl::device > devices = viennacl::ocl::platform().devices();

  // NOTE(review): 'int i' is compared against devices.size() (an unsigned type); prefer
  // std::size_t for the index. The '{' opened here has no visible closing brace in this excerpt.
  for (int i = 0; i < devices.size(); i++) {
      std::cout << devices[i].info() << "\n";

  // Print the device the current ViennaCL context actually uses:
  std::cout << "SELECTED DEVICE: \n";
  std::cout << viennacl::ocl::current_context().current_device().info() << "\n";

  ///////////// Vector operations /////////////////

  // Define a few vectors (from STL and plain C) and viennacl::vectors
  std::vector<ScalarType>      std_vec1(10);
  std::vector<ScalarType>      std_vec2(10);
  ScalarType                   plain_vec3[10];  //plain C array

  viennacl::vector<ScalarType> vcl_vec1(10);
  viennacl::vector<ScalarType> vcl_vec2(10);
  viennacl::vector<ScalarType> vcl_vec3(10);

  // Fill std_vec1, vcl_vec2 and plain_vec3 with random values:
  // (random<> is a helper function from Random.hpp)

  for (unsigned int i = 0; i < 10; ++i)
    std_vec1[i] = random<ScalarType>();
    vcl_vec2(i) = random<ScalarType>();  //also works for GPU vectors, but is MUCH slower (approx. factor 10.000) than the CPU analogue
    plain_vec3[i] = random<ScalarType>();

  // Copy the CPU vectors to the GPU vectors and vice versa
  copy(std_vec1.begin(), std_vec1.end(), vcl_vec1.begin()); //CPU to GPU, the STL way
  copy(vcl_vec2.begin(), vcl_vec2.end(), std_vec2.begin()); //GPU to CPU, again the STL way
  copy(vcl_vec2, std_vec2);                                 //using the short hand notation for objects that provide .begin() and .end() members
  copy(vcl_vec2.begin(), vcl_vec2.end(), plain_vec3);       //copy to plain C vector

  // Compute the inner product of two GPU vectors and write the result to either CPU or GPU

  vcl_s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);
  s1 = viennacl::linalg::inner_prod(vcl_vec1, vcl_vec2);

  // Compute norms:

  s1 = viennacl::linalg::norm_1(vcl_vec1);
  vcl_s2 = viennacl::linalg::norm_2(vcl_vec2);
  s3 = viennacl::linalg::norm_inf(vcl_vec3);

  // Plane rotation of two vectors:

  viennacl::linalg::plane_rotation(vcl_vec1, vcl_vec2, 1.1f, 2.3f);

  // Use viennacl::vector via the overloaded operators just as you would write it on paper:

  //simple expression:
  vcl_vec1 = vcl_s1 * vcl_vec2 / vcl_s3;

  //more complicated expression:
  vcl_vec1 = vcl_vec2 / vcl_s1 + vcl_s2 * (vcl_vec1 - vcl_s2 * vcl_vec2);

  // Swap the content of two vectors without a temporary vector:

  swap(vcl_vec1, vcl_vec2);

  //  That's it. Move on to the second tutorial, where dense matrices are explained.
  std::cout << "!!!! TUTORIAL 1 COMPLETED SUCCESSFULLY !!!!" << std::endl;

  return 0;