Esempio n. 1
0
    /**
     * Sets up the OpenCL environment for this function object and initializes
     * clBLAS.
     *
     * Queries one platform and one device of type `devType` on it, creates a
     * context and a command queue for that device, obtains a timer id from
     * the timer and stores the value returned by queryMemAllocSize().
     *
     * @param _timer   timer used to initialize the `timer` member and to
     *                 obtain `timer_id`
     * @param devType  OpenCL device type passed to clGetDeviceIDs()
     *
     * OpenCL statuses are handed to the OPENCL_V_THROW macro.
     */
    clblasFunc(StatisticalTimer& _timer, cl_device_type devType)
          : timer(_timer)
    {
        cl_int err;

        /* Setup OpenCL environment. */
        OPENCL_V_THROW(clGetPlatformIDs(1, &platform_, NULL),
                       "getting platform IDs");
        OPENCL_V_THROW(clGetDeviceIDs(platform_, devType, 1,
                                      &device_, NULL), "getting device IDs");
        props_[0] = CL_CONTEXT_PLATFORM;
        props_[1] = (cl_context_properties)platform_;
        props_[2] = 0;
        ctx_ = clCreateContext(props_, 1, &device_, NULL, NULL, &err);
        OPENCL_V_THROW(err, "creating context");
        queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
        /* The queue status used to be ignored; report it like the others. */
        OPENCL_V_THROW(err, "creating command queue");

        timer_id = timer.getUniqueID( "clfunc", 0 );

        maxMemAllocSize = queryMemAllocSize( device_ );

        /* Setup clblas. */
        err = clblasSetup();
        if (err != CL_SUCCESS) {
            /* Stream the actual status code (iostreams do not expand %d). */
            std::cerr << "clblasSetup() failed with " << err << "\n";
            /* NOTE(review): construction still completes after this point, so
             * queue_ and ctx_ keep referring to released handles - confirm
             * whether this path should throw instead. */
            clReleaseCommandQueue(queue_);
            clReleaseContext(ctx_);
        }
    }
Esempio n. 2
0
/*
 * Runs clblasDgemm (column-major) on device buffers built from the host
 * arrays a, b and c, then reads the result buffer back into c.
 *
 * env              environment handed to create_mem()/read_mem(); its
 *                  queues[0] is the command queue used for the BLAS call
 * a, b, c          host arrays; size_a/size_b/size_c are forwarded to
 *                  create_mem()/read_mem()
 * alpha, beta      scalars forwarded to clblasDgemm; when beta is zero the
 *                  C buffer is created with a NULL host pointer and no event
 * transA, transB   transpose flags forwarded to clblasDgemm
 * ar, ac           passed as M (and lda) and K respectively
 * br, bc           passed as ldb and N respectively
 * cr               passed as ldc (cc is not used in this function)
 *
 * Returns CL_SUCCESS when the end of the function is reached; every OpenCL /
 * clblas status is passed through the CHECK macro.
 */
cl_int Dgemm_internal(
  cl_env *env, double *a, double *b, double *c, double alpha, double beta,
  clblasTranspose transA, clblasTranspose transB,
  int ar, int ac, int br, int bc, int cr, int cc, int size_a, int size_b, int size_c)
{
  CHECK(clblasSetup());

  cl_event evts[NEVENTS];
  int n_wait = 0;  /* number of events the BLAS call has to wait on */

  cl_mem dev_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &evts[n_wait]);
  ++n_wait;
  cl_mem dev_b = create_mem(env, b, size_b, CL_MEM_READ_ONLY, &evts[n_wait]);
  ++n_wait;

  cl_mem dev_c;
  if (beta == 0) {
    /* No host data and no event are passed for C in this case. */
    dev_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);
  } else {
    dev_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, &evts[n_wait]);
    ++n_wait;
  }

  /* The two slots after the wait list hold the BLAS event and the read event. */
  cl_event *blas_done = &evts[n_wait];
  cl_event *read_done = &evts[n_wait + 1];

  cl_int err = clblasDgemm(clblasColumnMajor, transA, transB,
    ar, bc, ac, alpha, dev_a, 0, ar, dev_b, 0, br, beta, dev_c, 0, cr,
    1, &(env->queues[0]), n_wait, evts, blas_done);
  CHECK(err);

  *read_done = *read_mem(env, dev_c, c, size_c, 1, blas_done);
  CHECK(clWaitForEvents(1, read_done));

  CHECK(clReleaseMemObject(dev_a));
  CHECK(clReleaseMemObject(dev_b));
  CHECK(clReleaseMemObject(dev_c));
  clblasTeardown();
  return CL_SUCCESS;
}
/*
 * Takes one reference on the clBLAS setup shared through `refcnt`.
 *
 * clblasSetup() is invoked only while the counter is zero; its status is
 * handed to CLB_CHECK together with ctx->err. The context's blas_handle is
 * pointed at the counter if it was still NULL.
 *
 * Returns GA_NO_ERROR when the end of the function is reached.
 */
static int setup(gpucontext *ctx) {
  if (!refcnt) {
    CLB_CHECK(ctx->err, clblasSetup());
  }

  if (!ctx->blas_handle)
    ctx->blas_handle = &refcnt;

  ++refcnt;
  return GA_NO_ERROR;
}
Esempio n. 4
0
// ========================================
// initialization
// --------------------
// Initializes g_runtime, loads the clmagma_kernels entry into it, stores the
// runtime's context in gContext and sets up clBLAS.
//
// Returns the status of clblasSetup() (after it has been passed to
// check_error); g_event is reset to NULL before returning.
extern "C" magma_int_t
magma_init()
{
    g_runtime.init();
    g_runtime.load_kernels( 1, &clmagma_kernels );
    gContext = g_runtime.get_context();
    
    // clBLAS is set up after the runtime calls above.
    cl_int err = clblasSetup();
    check_error( err );
    
    g_event = NULL;

    return err;
}
/*
 * Takes one reference on the clBLAS setup shared through `refcnt`.
 *
 * `c` is treated as a cl_ctx pointer. clblasSetup() is invoked only while the
 * counter is zero; if it does not return clblasSuccess the function returns
 * GA_BLAS_ERROR without touching the context or the counter. Otherwise the
 * context's blas_handle is pointed at the counter (if it was NULL), the
 * counter is incremented and GA_NO_ERROR is returned.
 */
static int setup(void *c) {
  cl_ctx *ctx = (cl_ctx *)c;

  if (refcnt == 0 && clblasSetup() != clblasSuccess)
    return GA_BLAS_ERROR;

  if (ctx->blas_handle == NULL) {
    ctx->blas_handle = &refcnt;
  }
  ++refcnt;

  return GA_NO_ERROR;
}
Esempio n. 6
0
void InitJTorch(const bool use_cpu, const uint32_t requested_deviceid,
                const bool verbose_startup) {
  std::lock_guard<std::mutex> lck(cl_context_lock_);
  // Check we haven't already called init.
  RASSERT(cl_context == nullptr);

  if (verbose_startup) {
    std::cout << "Valid OpenCL devices attached:" << std::endl;
    const uint32_t num_devices = jcl::OpenCLContext::printDevices();
    static_cast<void>(num_devices);
  }

  jcl::CLDevice device = use_cpu ? jcl::CLDeviceCPU : jcl::CLDeviceGPU;
  jcl::CLVendor vendor = jcl::CLVendorAny;

  const bool device_exists =
      jcl::OpenCLContext::queryDeviceExists(device, vendor);
  if (!device_exists) {
    if (use_cpu) {
      std::cerr << "No CPU devices attached.";
    } else {
      std::cerr << "No GPU devices attached.";
    }
  }
  RASSERT(device_exists);

  // Otherwise, initialize the context.
  cl_context.reset(new jcl::OpenCLContext());
  cl_context->init(device, jcl::CLVendorAny, verbose_startup);

  // Make sure the user is requesting a device id that exists.
  RASSERT(requested_deviceid < cl_context->getNumDevices());
  deviceid = requested_deviceid;

  std::cout << "Jtorch is using device " << deviceid << ": "
            << cl_context->getDeviceName(deviceid) << std::endl;

  // Startup clblas.
  // TODO(tompson): I have NO idea what device ID this will run on.
  const cl_int blas_ret = clblasSetup();
  const bool blas_ok = (blas_ret == CL_SUCCESS);
  if (!blas_ok) {
    std::cout << "ERROR - InitJTorchInternal: clblasSetup returned error: "
              << jcl::OpenCLContext::getErrorString(blas_ret);
  }
  RASSERT(blas_ok);
}
Esempio n. 7
0
/*
 * Runs clblasDtrmm (column-major) on device buffers built from the host
 * arrays a and b, then reads the B buffer back into b.
 *
 * env                 environment handed to create_mem()/read_mem(); its
 *                     queues[0] is the command queue used for the BLAS call
 * a, b                host arrays; size_a/size_b are forwarded to
 *                     create_mem()/read_mem()
 * alpha               scalar forwarded to clblasDtrmm
 * side, transA,
 * uplo, diag          flags forwarded to clblasDtrmm
 * ar                  passed as lda (ac is not used in this function)
 * br, bc              passed as M (and ldb) and N respectively
 *
 * Returns CL_SUCCESS when the end of the function is reached; every OpenCL /
 * clblas status is passed through the CHECK macro.
 */
cl_int Dtrmm_internal(
  cl_env *env, double *a, double *b, double alpha, clblasSide side, clblasTranspose transA,
  clblasUplo uplo, clblasDiag diag, int ar, int ac, int br, int bc, int size_a, int size_b)
{
  CHECK(clblasSetup());

  cl_event evts[NEVENTS];
  int n_wait = 0;  /* number of events the BLAS call has to wait on */

  cl_mem dev_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &evts[n_wait]);
  ++n_wait;
  cl_mem dev_b = create_mem(env, b, size_b, CL_MEM_READ_WRITE, &evts[n_wait]);
  ++n_wait;

  /* The two slots after the wait list hold the BLAS event and the read event. */
  cl_event *blas_done = &evts[n_wait];
  cl_event *read_done = &evts[n_wait + 1];

  cl_int err = clblasDtrmm(clblasColumnMajor, side, uplo, transA, diag,
    br, bc, alpha, dev_a, 0, ar, dev_b, 0, br,
    1, &(env->queues[0]), n_wait, evts, blas_done);
  CHECK(err);

  *read_done = *read_mem(env, dev_b, b, size_b, 1, blas_done);
  CHECK(clWaitForEvents(1, read_done));

  CHECK(clReleaseMemObject(dev_a));
  CHECK(clReleaseMemObject(dev_b));
  clblasTeardown();
  return CL_SUCCESS;
}
Esempio n. 8
0
// Selects `device_id` as the device to use: forwards it to SetDevices() as a
// one-element list, stores its device context as the default one and then
// performs backend-specific setup (cuBLAS/cuRAND handles for CUDA, clblasSetup
// otherwise when built with USE_GREENTEA and USE_CLBLAS).
void Caffe::SetDevice(const int device_id) {
  std::vector<int> devices;
  devices.push_back(device_id);
  Caffe::SetDevices(devices);

  Get().default_device_context_ = GetDeviceContext(device_id);

  if (Get().default_device_context_->backend() == Backend::BACKEND_CUDA) {
#ifdef USE_CUDA
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));
    // Nothing more to do when the requested device is already current.
    if (current_device == device_id) {
      return;
    }
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
// NOTE(review): Get() has already been called earlier in this function -
// confirm whether the ordering requirement above still holds.
    CUDA_CHECK(cudaSetDevice(device_id));
    // Destroy handles that already exist before creating new ones.
    if (Get().cublas_handle_)
      CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_));
    if (Get().curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_));
    }
    CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_));
    CURAND_CHECK(
        curandCreateGenerator(&Get().curand_generator_,
                              CURAND_RNG_PSEUDO_DEFAULT));
    // The new generator is seeded with a value from cluster_seedgen().
    CURAND_CHECK(
        curandSetPseudoRandomGeneratorSeed(Get().curand_generator_,
                                           cluster_seedgen()));
#endif  // USE_CUDA
  } else {
#ifdef USE_GREENTEA
#ifdef USE_CLBLAS
    // NOTE(review): the status returned by clblasSetup() is discarded here.
    clblasSetup();
#endif  // USE_CLBLAS
#endif  // USE_GREENTEA
  }
}
Esempio n. 9
0
/*
 * Runs clblasDsyrk (column-major) on device buffers built from the host
 * arrays a and c, then reads the C buffer back into c.
 *
 * env            environment handed to create_mem()/read_mem(); its
 *                queues[0] is the command queue used for the BLAS call
 * a, c           host arrays; size_a/size_c are forwarded to
 *                create_mem()/read_mem()
 * alpha, beta    scalars forwarded to clblasDsyrk; when beta is zero the
 *                C buffer is created with a NULL host pointer and no event
 * transA, uplo   flags forwarded to clblasDsyrk
 * ar, ac         K is ar for clblasNoTrans and ac otherwise; ac is also
 *                passed as lda
 * n              passed as N and as ldc
 *
 * NOTE(review): Dgemm_internal passes `ar` as lda while this function passes
 * `ac` and derives K from `ar` in the non-transposed case - confirm the
 * intended storage layout of `a` against the callers.
 *
 * Returns CL_SUCCESS when the end of the function is reached; every OpenCL /
 * clblas status is passed through the CHECK macro.
 */
cl_int Dsyrk_internal(
  cl_env *env, double *a, double *c, double alpha, double beta,
  clblasTranspose transA, clblasUplo uplo, int ar, int ac, int n, int size_a, int size_c)
{
  CHECK(clblasSetup());

  cl_event evts[NEVENTS];
  int n_wait = 0;  /* number of events the BLAS call has to wait on */

  cl_mem dev_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &evts[n_wait]);
  ++n_wait;

  cl_mem dev_c;
  if (beta == 0) {
    /* No host data and no event are passed for C in this case. */
    dev_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);
  } else {
    dev_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, &evts[n_wait]);
    ++n_wait;
  }

  int k;
  if (transA == clblasNoTrans) {
    k = ar;
  } else {
    k = ac;
  }

  /* The two slots after the wait list hold the BLAS event and the read event. */
  cl_event *blas_done = &evts[n_wait];
  cl_event *read_done = &evts[n_wait + 1];

  cl_int err = clblasDsyrk(clblasColumnMajor, uplo, transA,
    n, k, alpha, dev_a, 0, ac, beta, dev_c, 0, n,
    1, &(env->queues[0]), n_wait, evts, blas_done);
  CHECK(err);

  *read_done = *read_mem(env, dev_c, c, size_c, 1, blas_done);
  CHECK(clWaitForEvents(1, read_done));

  CHECK(clReleaseMemObject(dev_a));
  CHECK(clReleaseMemObject(dev_c));
  clblasTeardown();
  return CL_SUCCESS;
}
Esempio n. 10
0
/*
 * Example: computes the dot product of the vectors X and Y with clblasSdot on
 * a GPU device of the first reported OpenCL platform and prints it.
 *
 * Returns 0 on success and 1 when any OpenCL or clblas call fails.
 */
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX = NULL, bufY = NULL, bufDotP = NULL, scratchBuff = NULL;
    cl_event event = NULL;
    int ret = 0;
    int lenX = 1 + (N-1)*abs(incx);
    int lenY = 1 + (N-1)*abs(incy);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place the vectors inside them.
     * Each step runs only if every previous one succeeded. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err);
    if (err == CL_SUCCESS)
        bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenY*sizeof(cl_float)), NULL, &err);
    /* Allocate 1 element space for dotProduct */
    if (err == CL_SUCCESS)
        bufDotP = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err);
    /* Allocate minimum of N elements */
    if (err == CL_SUCCESS)
        scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err);

    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);

    if (err != CL_SUCCESS) {
        printf("preparing OpenCL buffers failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Call clblas function. */
        err = clblasSdot( N, bufDotP, 0, bufX, 0, incx, bufY, 0, incy, scratchBuff,
                          1, &queue, 0, NULL, &event);
        if (err != CL_SUCCESS) {
            printf("clblasSdot() failed with %d\n", err);
            ret = 1;
        }
        else {
            /* Wait for calculations to be finished. */
            err = clWaitForEvents(1, &event);

            /* Fetch results of calculations from GPU memory. */
            if (err == CL_SUCCESS)
                err = clEnqueueReadBuffer(queue, bufDotP, CL_TRUE, 0, sizeof(cl_float),
                                          &dotProduct, 0, NULL, NULL);

            if (err != CL_SUCCESS) {
                /* Do not print a result that was never fetched. */
                printf("fetching the result failed with %d\n", err);
                ret = 1;
            }
            else {
                printf("Result dot product: %f\n", dotProduct);
            }
        }
    }

    /* Release the OpenCL event (it was never released before). */
    if (event != NULL)
        clReleaseEvent(event);

    /* Release OpenCL memory objects that were actually created. */
    if (bufY != NULL)
        clReleaseMemObject(bufY);
    if (bufX != NULL)
        clReleaseMemObject(bufX);
    if (bufDotP != NULL)
        clReleaseMemObject(bufDotP);
    if (scratchBuff != NULL)
        clReleaseMemObject(scratchBuff);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Esempio n. 11
0
// Performs the one-time OpenCL initialization for the manager singleton.
//
// Queries the platforms, selects platform index 0, creates its context,
// compiles the listed .cl source files, selects a GPU device using
// instance_.device_id_, creates its command queue and sets up clBLAS.
//
// Returns true if initialization succeeded or had already been done; each
// failure is logged with LOG(FATAL) and followed by `return false`.
bool OpenCLManager::Init() {
  if (instance_.initialized_) {
    return true;
  }
  LOG(INFO) << "Initialize OpenCL";
  instance_.Query();
  if ( OpenCLManager::GetNumPlatforms() <= 0 ) {
    LOG(FATAL) << "No OpenCL platforms found.";
    return false;
  }

  // TODO: mechanism for choosing the correct platform.
  instance_.current_platform_index_ = 0;

  std::tr1::shared_ptr<OpenCLPlatform> pf = CurrentPlatform();
  pf->print();

  if (!pf->createContext()) {
    LOG(FATAL) << "failed to create OpenCL context for platform " << pf->name();
    return false;
  }

  // OpenCL source files to compile, given as paths relative to the working
  // directory.
  std::vector<std::string> cl_files;
  cl_files.push_back("src/caffe/util/OpenCL/math_functions.cl");
  cl_files.push_back("src/caffe/util/OpenCL/gemm.cl");
  cl_files.push_back("src/caffe/util/OpenCL/im2col.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/pooling_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/relu_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/prelu_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/sigmoid_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/tanh_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/dropout_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/bnll_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/contrastive_loss_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/eltwise_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/lrn_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/softmax_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/softmax_loss_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/threshold_layer.cl");
  cl_files.push_back("src/caffe/layers/OpenCL/mvn_layer.cl");

  for ( std::vector<std::string>::iterator it = cl_files.begin();
        it != cl_files.end(); ++it ) {
    if ( !pf->compile(*it) ) {
      // Name the offending file so the failure can be located.
      LOG(FATAL) << "failed to create OpenCL program from " << *it
                 << " for platform " << pf->name();
      return false;
    }
  }

  if ( pf->getNumGPUDevices() < 1 ) {
    LOG(FATAL) << "No GPU devices available at platform " << pf->name();
    return false;
  }
  pf->SetCurrentDevice(CL_DEVICE_TYPE_GPU, instance_.device_id_);
  OpenCLDevice& device = pf->CurrentDevice();
  if (!device.createQueue()) {
    LOG(FATAL) << "failed to create OpenCL command queue for device "
               << device.name();
    return false;
  }

  if ( clblasSetup() != CL_SUCCESS ) {
    LOG(FATAL) << "failed to initialize clBlas";
    return false;
  }

  device.print();
  instance_.initialized_ = true;
  return true;
}
Esempio n. 12
0
/*
 * Example: runs clblasZhemv on the matrix A and the vectors X and Y using a
 * GPU device of the first reported OpenCL platform and prints the result.
 *
 * Returns 0 on success and 1 when any OpenCL or clblas call fails.
 */
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA = NULL, bufX = NULL, bufY = NULL;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them.
     * Each step runs only if every previous one succeeded. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A),
                          NULL, &err);
    if (err == CL_SUCCESS)
        bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X),
                              NULL, &err);
    if (err == CL_SUCCESS)
        bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y),
                              NULL, &err);

    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
            N * lda * sizeof(*A), A, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
            N * sizeof(*X), X, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
            N * sizeof(*Y), Y, 0, NULL, NULL);

    if (err != CL_SUCCESS) {
        printf("preparing OpenCL buffers failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Call clblas function. */
        err = clblasZhemv(order, uplo, N, alpha, bufA, 0 /*offA */, lda,
                          bufX, 0 /*offx*/, incx, beta,
                          bufY, 0 /*offy*/, incy, 1, &queue, 0, NULL, &event);

//      blasZhemv(order, uplo, N, alpha, (DoubleComplex*)A, 0, lda, (DoubleComplex*)X, 0, incx, beta, (DoubleComplex*)Y, 0, incy);
//      err = CL_SUCCESS;
        //err = clblasZtrmv(order, uplo, clblasNoTrans, clblasNonUnit, N, bufA, 0 /*offA */, lda,
        //                                      bufX, 0 /*offx*/, incx,
        //                                      bufY, 1, &queue, 0, NULL, &event);

        if (err != CL_SUCCESS) {
            printf("clblasZhemv() failed with %d\n", err);
            ret = 1;
        }
        else {
            /* Wait for calculations to be finished. */
            err = clWaitForEvents(1, &event);
            printResult();
            /* Fetch results of calculations from GPU memory. */
            if (err == CL_SUCCESS)
                err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y),
                                          Y, 0, NULL, NULL);
            if (err != CL_SUCCESS) {
                printf("fetching the result failed with %d\n", err);
                ret = 1;
            }
            else {
                /* At this point the result of ZHEMV has been read into the Y array. */
                printResult();
            }
        }
    }

    /* Release OpenCL events (only if one was actually returned). */
    if (event != NULL)
        clReleaseEvent(event);

    /* Release OpenCL memory objects that were actually created. */
    if (bufY != NULL)
        clReleaseMemObject(bufY);
    if (bufX != NULL)
        clReleaseMemObject(bufX);
    if (bufA != NULL)
        clReleaseMemObject(bufA);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Esempio n. 13
0
/*
 * Example: multiplies the matrices A and B into C with clblasSgemm
 * (row-major, no transposition) on a GPU device of the first reported OpenCL
 * platform and reads the product into `result`.
 *
 * Returns 0 on success and 1 when any OpenCL or clBLAS call fails.
 */
int main( void )
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA = NULL, bufB = NULL, bufC = NULL;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue( ctx, device, 0, &err );
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext( ctx );
        return 1;
    }

    /* Setup clBLAS */
    err = clblasSetup( );
    if (err != CL_SUCCESS) {
        printf( "clblasSetup() failed with %d\n", err );
        clReleaseCommandQueue( queue );
        clReleaseContext( ctx );
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them.
     * Each step runs only if every previous one succeeded. */
    bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
                           NULL, &err );
    if (err == CL_SUCCESS)
        bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
                               NULL, &err );
    if (err == CL_SUCCESS)
        bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
                               NULL, &err );

    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
                                    M * K * sizeof( *A ), A, 0, NULL, NULL );
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
                                    K * N * sizeof( *B ), B, 0, NULL, NULL );
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
                                    M * N * sizeof( *C ), C, 0, NULL, NULL );

    if (err != CL_SUCCESS) {
        printf( "preparing OpenCL buffers failed with %d\n", err );
        ret = 1;
    }
    else {
        /* Call the clBLAS gemm function on the full matrices (all offsets are 0). */
        err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
                           M, N, K,
                           alpha, bufA, 0, lda,
                           bufB, 0, ldb, beta,
                           bufC, 0, ldc,
                           1, &queue, 0, NULL, &event );
        if (err != CL_SUCCESS) {
            printf( "clblasSgemm() failed with %d\n", err );
            ret = 1;
        }
        else {
            /* Wait for calculations to be finished. */
            err = clWaitForEvents( 1, &event );

            /* Fetch results of calculations from GPU memory. */
            if (err == CL_SUCCESS)
                err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
                                           M * N * sizeof(*result),
                                           result, 0, NULL, NULL );
            if (err != CL_SUCCESS) {
                printf( "fetching the result failed with %d\n", err );
                ret = 1;
            }
        }
    }

    /* Release the OpenCL event (it was never released before). */
    if (event != NULL)
        clReleaseEvent( event );

    /* Release OpenCL memory objects that were actually created. */
    if (bufC != NULL)
        clReleaseMemObject( bufC );
    if (bufB != NULL)
        clReleaseMemObject( bufB );
    if (bufA != NULL)
        clReleaseMemObject( bufA );

    /* Finalize work with clBLAS */
    clblasTeardown( );

    /* Release OpenCL working objects. */
    clReleaseCommandQueue( queue );
    clReleaseContext( ctx );

    return ret;
}
Esempio n. 14
0
/*
 * Example: runs clblasSspr2 on the packed matrix AP and the vectors X and Y
 * using a GPU device of the first reported OpenCL platform and prints the
 * result.
 *
 * Returns 0 on success and 1 when any OpenCL or clblas call fails.
 */
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufAP = NULL, bufX = NULL, bufY = NULL;
    cl_event event = NULL;
    int ret = 0, numElementsAP;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Number of elements in a packed N x N triangular matrix. */
    numElementsAP = (N * (N+1)) / 2;

    /* Prepare OpenCL memory objects and place matrices inside them.
     * Each step runs only if every previous one succeeded. */
    bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_float)),
                           NULL, &err);
    if (err == CL_SUCCESS)
        bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
                              NULL, &err);
    if (err == CL_SUCCESS)
        bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float),
                              NULL, &err);

    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0,
                    numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0,
                    N * sizeof(cl_float), X, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0,
                    N * sizeof(cl_float), Y, 0, NULL, NULL);

    if (err != CL_SUCCESS) {
        printf("preparing OpenCL buffers failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Call clblas function. */
        err = clblasSspr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy,
                          bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event);

        if (err != CL_SUCCESS) {
            printf("clblasSspr2() failed with %d\n", err);
            ret = 1;
        }
        else {
            /* Wait for calculations to be finished. */
            err = clWaitForEvents(1, &event);

            /* Fetch results of calculations from GPU memory. */
            if (err == CL_SUCCESS)
                err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float)),
                                          AP, 0, NULL, NULL);
            if (err != CL_SUCCESS) {
                printf("fetching the result failed with %d\n", err);
                ret = 1;
            }
            else {
                /* At this point the result of SSPR2 has been read into the AP array. */
                printResult();
            }
        }
    }

    /* Release the OpenCL event (it was never released before). */
    if (event != NULL)
        clReleaseEvent(event);

    /* Release OpenCL memory objects that were actually created. */
    if (bufX != NULL)
        clReleaseMemObject(bufX);
    if (bufAP != NULL)
        clReleaseMemObject(bufAP);
    if (bufY != NULL)
        clReleaseMemObject(bufY);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Esempio n. 15
0
/*
 * Example: prints the name of the selected GPU device, then runs clblasSgemm
 * on sub-matrices of A, B and C (dimensions reduced by `off`, buffer offsets
 * offA/offB/offC) and prints the result.
 *
 * Returns 0 on success and 1 when any required OpenCL or clblas call fails.
 */
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA = NULL, bufB = NULL, bufC = NULL;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    /* Print the device name. This is informational only, so a failure here
     * does not abort the example. clGetDeviceInfo reports the size through a
     * size_t, not an int. */
    size_t valueSize = 0;
    err = clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &valueSize);
    if (err == CL_SUCCESS && valueSize > 0) {
        char *value = (char*) malloc(valueSize);
        if (value != NULL) {
            if (clGetDeviceInfo(device, CL_DEVICE_NAME, valueSize, value, NULL) == CL_SUCCESS)
                printf("Device: %s\n", value);
            free(value);
        }
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them.
     * Each step runs only if every previous one succeeded. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
                          NULL, &err);
    if (err == CL_SUCCESS)
        bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
                              NULL, &err);
    if (err == CL_SUCCESS)
        bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
                              NULL, &err);

    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
            M * K * sizeof(*A), A, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
            K * N * sizeof(*B), B, 0, NULL, NULL);
    if (err == CL_SUCCESS)
        err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
            M * N * sizeof(*C), C, 0, NULL, NULL);

    if (err != CL_SUCCESS) {
        printf("preparing OpenCL buffers failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Call clblas function. Perform gemm for the lower right sub-matrices */
        err = clblasSgemm(order, transA, transB, M - off, N - off, K - off,
                          alpha, bufA, offA, lda,
                          bufB, offB, ldb, beta,
                          bufC, offC, ldc,
                          1, &queue, 0, NULL, &event);
        if (err != CL_SUCCESS) {
            /* Name the function that is actually called. */
            printf("clblasSgemm() failed with %d\n", err);
            ret = 1;
        }
        else {
            /* Wait for calculations to be finished. */
            err = clWaitForEvents(1, &event);

            /* Fetch results of calculations from GPU memory. */
            if (err == CL_SUCCESS)
                err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
                                          M * N * sizeof(*result),
                                          result, 0, NULL, NULL);

            if (err != CL_SUCCESS) {
                printf("fetching the result failed with %d\n", err);
                ret = 1;
            }
            else {
                /* At this point the result of SGEMM has been read into the 'result' array. */
                puts("");
                printResult("clblasSgemmEx result");
            }
        }
    }

    /* Release OpenCL events (only if one was actually returned). */
    if (event != NULL)
        clReleaseEvent(event);

    /* Release OpenCL memory objects that were actually created. */
    if (bufC != NULL)
        clReleaseMemObject(bufC);
    if (bufB != NULL)
        clReleaseMemObject(bufB);
    if (bufA != NULL)
        clReleaseMemObject(bufA);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Esempio n. 16
0
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB,
                        double alpha, double beta, void *outMatrix, bool use_float)
{
    std::stringstream result;
    
    float *input_matrixA_f = (float *)inMatrixA;
    float *input_matrixB_f = (float *)inMatrixB;
    
    float *output_matrix_f = (float *)outMatrix;
    
    double *input_matrixA_d = (double *)inMatrixA;
    double *input_matrixB_d = (double *)inMatrixB;
    
    double *output_matrix_d = (double *)outMatrix;
    
    if (debug) {
        result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ")" << std::endl << std::endl;
    }
    
    cl_int err = CL_SUCCESS;
    
    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        
        err = CL_INVALID_OPERATION;
    }
    
    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }
    
    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }
    
    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
        
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }
    
    // buffers
    cl_mem cl_input_matrixA = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixA:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(float), input_matrixA_f, &err);
            
        } else {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(double), input_matrixA_d, &err);
        }
    }
    
    cl_mem cl_input_matrixB = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixB:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(float), input_matrixB_f, &err);
            
        } else {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(double), input_matrixB_d, &err);
        }
    }
    
    int nrowC = transposeA ? ncolA : nrowA;
    int ncolC = transposeB ? nrowB : ncolB;
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        
        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(float), output_matrix_f, &err);
            
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(double), output_matrix_d, &err);
        }
        
    }
    
    // ++++++++++++
    const int lda = nrowA;  // first dimension of A (rows), before any transpose
    const int ldb = nrowB;  // first dimension of B (rows), before any transpose
    const int ldc = nrowC;      // first dimension of C (rows)
    
    const int M = transposeA ? ncolA : nrowA;    // rows in A (after transpose, if any) and C
    const int N = transposeB ? nrowB : ncolB;    // cols in B (after transpose, if any) and C
    const int K = transposeA ? nrowA : ncolA;    // cols in A and rows in B (after transposes, if any)
    
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
    const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;
    
    cl_event event = NULL;
    
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSgemm:" << std::endl;
            }
            
            status = clblasSgemm(order, transA, transB, M, N, K,
                              alpha, cl_input_matrixA, 0, lda,
                              cl_input_matrixB, 0, ldb, beta,
                              cl_output_matrix, 0, ldc,
                              1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS && debug) {
                result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
            }
            
        } else {
            if (debug) {
                result << "clblasDgemm:" << std::endl;
            }
            
            status = clblasDgemm(order, transA, transB, M, N, K,
                                 alpha, cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb, beta,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                
                err = status;
            }
        }
    }
    
    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }
    
    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL);
            
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL);
        }
    }
    
    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;
    
    // cleanup
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    
    clReleaseMemObject(cl_input_matrixA);
    cl_input_matrixA = NULL;
    
    clReleaseMemObject(cl_input_matrixB);
    cl_input_matrixB = NULL;
    
    clReleaseCommandQueue(queue);
    queue = NULL;
    
    clReleaseContext(context);
    context = NULL;
    
    if (debug) {
        CERR << result.str();
    }
    
    ErrorStatus errorStatus = { err, status };
    
    return errorStatus;
}
Esempio n. 17
0
/**
 * Sets up the OpenCL provider.
 *
 * Allocates the holders for the current platform/context/device, the kernel
 * argument structures and the kernel objects, wraps every platform reported
 * by cl::Platform::get() in a PlatformContainer, and initializes clBLAS.
 * Afterwards device (0,0) is selected, the kernels are built and test() is
 * run.
 *
 * Throws the int value -1 when clblasSetup() fails or when a cl::Error is
 * raised during the setup phase.
 */
SpatialSEIR::OCLProvider::OCLProvider()
{
    lssCout << "Setting up OpenCL Interface\n";
    try
    {
        // Allocate space for platforms and current config
        platforms = new std::vector<PlatformContainer*>();
        currentPlatform = new cl::Platform*;
        currentContext = new cl::Context*;
        currentDevice = new DeviceContainer*;

        // A totalWorkUnits of -1 is the initial value for both argument sets
        R_star_args = new FC_R_Star_KernelData(); 
        R_star_args -> totalWorkUnits = -1;

        p_se_args = new P_SE_Calculation_KernelData();
        p_se_args -> totalWorkUnits = -1;

        // Allocate space for kernels
        test_kernel = new cl::Kernel();
        R_Star_kernel = new cl::Kernel();
        p_se_kernel1 = new cl::Kernel();
        p_se_kernel2 = new cl::Kernel();


        // Build platforms, devices, contexts
        // NOTE(review): pformVec is never freed in this constructor. Each
        // PlatformContainer is handed a pointer to one of its elements, so
        // it presumably has to outlive them -- confirm before changing
        // ownership.
        cl_uint i;
        std::vector<cl::Platform> *pformVec = new std::vector<cl::Platform>;
        cl::Platform::get(pformVec);

        PlatformContainer* newPlatform;
        for (i = 0; i < pformVec -> size(); i++) 
        {
            newPlatform = new PlatformContainer((&(*pformVec)[i]));
            platforms -> push_back(newPlatform);
        }


        // Initialize clBLAS library
        
        clblasStatus err = clblasSetup();
        if (err != CL_SUCCESS)
        {
            lssCout << "Error setting up clBLAS library: " << err << "\n";
            throw(-1);
        }

        // Flag for existence of current<item>s
        isSetup = new int; *isSetup = 0;


    }
    // Catch by const reference: avoids copying (and potentially slicing)
    // the exception object.
    catch(const cl::Error& e)
    {
        cout << "Problem getting platforms:" << endl;
        cout << e.what() << ": Error Code " << e.err() << endl;
        throw(-1);
    }

    // Create Kernels
    // Dummy code to pick device 0,0
    setDevice(0,0);
    buildKernels();
    test();
}
Esempio n. 18
0
/*
 * Sample driver for clblasSrot.
 *
 * Creates an OpenCL context and command queue on the first GPU device of the
 * first platform, uploads the vectors X and Y, applies the rotation with
 * clblasSrot, reads both vectors back and prints them before and after.
 *
 * Returns 0 on success and 1 if any OpenCL or clblas call fails. Every
 * object created before a failure is released on the way out.
 */
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX = NULL, bufY = NULL;
    cl_event event = NULL;
    int ret = 0;
    /* Number of elements spanned by N entries with the given stride. */
    int lenX = 1 + (N-1)*abs(incx);
    int lenY = 1 + (N-1)*abs(incy);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    /*
     * Only the GPU device is used. An earlier CPU lookup whose result was
     * immediately overwritten (and whose failure was misreported as a
     * clGetPlatformIDs() error) has been dropped.
     */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place vectors inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    if (err == CL_SUCCESS) {
        bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);
    }
    if (err != CL_SUCCESS) {
        printf("clCreateBuffer() failed with %d\n", err);
        ret = 1;
    }

    if (ret == 0) {
        err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
        if (err == CL_SUCCESS) {
            err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);
        }
        if (err != CL_SUCCESS) {
            printf("clEnqueueWriteBuffer() failed with %d\n", err);
            ret = 1;
        }
    }

    if (ret == 0) {
        /* Show the input vectors before the rotation is applied. */
        printResult();

        /* Call clblas function. */
        err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event);
        if (err != CL_SUCCESS) {
            printf("clblasSrot() failed with %d\n", err);
            ret = 1;
        }
    }

    if (ret == 0) {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        if (err == CL_SUCCESS) {
            err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                      Y, 0, NULL, NULL);
        }
        if (err == CL_SUCCESS) {
            err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                      X, 0, NULL, NULL);
        }

        if (err != CL_SUCCESS) {
            printf("Fetching the clblasSrot() result failed with %d\n", err);
            ret = 1;
        }
        else {
            /* At this point the rotated vectors are in X and Y. */
            printResult();
        }
    }

    /* Release OpenCL events. */
    if (event != NULL) {
        clReleaseEvent(event);
    }

    /* Release OpenCL memory objects. */
    if (bufY != NULL) {
        clReleaseMemObject(bufY);
    }
    if (bufX != NULL) {
        clReleaseMemObject(bufX);
    }

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Esempio n. 19
0
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufC, bufB;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A),
                          NULL, &err);
    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B),
                          NULL, &err);
    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C),
                          NULL, &err);

    if ((bufA == NULL) || (bufC == NULL) || (bufB == NULL))
    {
        printf("Failed to create buffern");
        return 1;
    }
    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
        N * K * sizeof(*A), A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
        N * K * sizeof(*B), B, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
        N * N * sizeof(*C), C, 0, NULL, NULL);

    /* Call clblas function. */
    err = clblasCher2k(order, uplo, transA, N, K, alpha, bufA, 0, lda, bufB, 0, ldb,
                            beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event);

    if (err != CL_SUCCESS) {
        printf("clblasCher2k() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C),
                                  C, 0, NULL, NULL);

        /* At this point you will get the result of SSYRK placed in C array. */
        printResult();
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);
    
    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufC);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufA);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
Esempio n. 20
0
/**
 * Computes the cross product t(X) %*% X of a column-major nrow x ncol matrix
 * with clBLAS (clblasSsyrk / clblasDsyrk) on the given device.
 *
 * @param device     OpenCL device a fresh context and command queue are created on.
 * @param inMatrix   Host pointer to nrow * ncol elements (float if use_float, else double).
 * @param outMatrix  Host pointer to ncol * ncol elements of the same element type.
 *                   Its contents are uploaded before the call; on success it receives
 *                   the result, which is passed through symmetrizeSquare_f/_d.
 * @param nrow       Number of rows of the input matrix.
 * @param ncol       Number of columns of the input matrix.
 * @param use_float  Selects single precision (true) or double precision (false).
 * @return ErrorStatus holding the last OpenCL error code and the last clBLAS status.
 *
 * When the global `debug` flag is set, a trace of the steps is written to CERR.
 */
ErrorStatus crossprod_clblas(cl_device_id device, void *inMatrix, void *outMatrix, int nrow, int ncol, bool use_float)
{
    std::stringstream result;

    // Byte counts are computed in size_t so that nrow * ncol cannot overflow
    // int before being widened.
    const size_t element_size = use_float ? sizeof(float) : sizeof(double);
    const size_t input_bytes = static_cast<size_t>(nrow) * static_cast<size_t>(ncol) * element_size;
    const size_t output_bytes = static_cast<size_t>(ncol) * static_cast<size_t>(ncol) * element_size;

    if (debug) {
        result << "crossprod_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ", nrow = " << nrow << ", ncol = " << ncol << ")" << std::endl << std::endl;
    }

    cl_int err = CL_SUCCESS;

    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }

        err = CL_INVALID_OPERATION;
    }

    // get first platform (only used for the debug trace below)
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }

    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }

    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }

        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }

    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }

        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);

#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }

        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }

    // buffers; the element type only affects the byte count, so the untyped
    // host pointers are passed straight through
    cl_mem cl_input_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrix:" << std::endl;
        }

        cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         input_bytes, inMatrix, &err);
    }

    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }

        cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                          output_bytes, outMatrix, &err);
    }

    // ++++++++++++
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = clblasTrans;

    const size_t lda = nrow;
    const size_t ldc = ncol;

    const cl_float alpha = 1.0;

    clblasUplo uplo = clblasUpper;

    cl_event event = NULL;

    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSsyrk:" << std::endl;
            }

            status = clblasSsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);

        } else {
            if (debug) {
                result << "clblasDsyrk:" << std::endl;
            }

            status = clblasDsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
                                 cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
        }

        // Both precisions propagate a failed clBLAS call into err, so the
        // wait/read steps below are skipped instead of waiting on a NULL event.
        if (status != CL_SUCCESS) {
            if (debug) {
                result << (use_float ? "clblasSsyrk error:" : "clblasDsyrk error:")
                       << clblasErrorToString(status) << std::endl;
            }

            err = status;
        }
    }

    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }

    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }

        err = clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, output_bytes,
                                  outMatrix, 0, NULL, NULL);

        // Only post-process data that was actually read back.
        if (err == CL_SUCCESS) {
            if (use_float) {
                symmetrizeSquare_f(static_cast<float *>(outMatrix), ncol);

            } else {
                symmetrizeSquare_d(static_cast<double *>(outMatrix), ncol);
            }
        }
    }

    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;

    // cleanup: release only what was actually created
    if (event != NULL) {
        clReleaseEvent(event);
        event = NULL;
    }

    if (cl_output_matrix != NULL) {
        clReleaseMemObject(cl_output_matrix);
        cl_output_matrix = NULL;
    }

    if (cl_input_matrix != NULL) {
        clReleaseMemObject(cl_input_matrix);
        cl_input_matrix = NULL;
    }

    if (queue != NULL) {
        clReleaseCommandQueue(queue);
        queue = NULL;
    }

    if (context != NULL) {
        clReleaseContext(context);
        context = NULL;
    }

    if (debug) {
        CERR << result.str();
    }

    ErrorStatus errorStatus = { err, status };

    return errorStatus;
}
Esempio n. 21
0
int
main(int argc, char *argv[])
{
    clMath::Config cfg;
    cfg.setDefaultConfig("ktest.cfg");
    if (!cfg.parseCommandLine(argc, argv) || !cfg.isSane()) {
        return 1;
    }

    clblasSetup();
    parseEnvImplementation();

    clMath::Step *masterStep = getMasterStep(cfg.blasFunctionID(),
                                          cfg.platform(), cfg.device());
    if (masterStep == NULL) {
        std::cerr << "Function support not implemented yet" << std::endl;
        return 1;
    }

    CLBlasKargs kargs;
    SubproblemDim subdims[MAX_SUBDIMS];

    cfg.kargs(&kargs);
    masterStep->setKargs(kargs);
    masterStep->fixLD();

    ListHead seq;
    listInitHead(&seq);
    bool severalKernels = false;

    /* Single kernel for this function */
    if (cfg.decomposition(subdims)) {
        masterStep->setDecomposition(subdims);
    }
    masterStep->completeDecompositionSingle();

    if (cfg.permitMultiKernels()) {
        masterStep->makeSolutionSequence(&seq,
                                         getPlatform(cfg.platform().c_str()));
        if (listLength(&seq) > 1) {
            severalKernels = true;
        }
    }

    if (severalKernels) {
        std::ofstream fs;
        ListNode *node;

        std::vector<clMath::Step*> steps;

        masterStep->declareVars(NULL);

        for (node = listNodeFirst(&seq); node != &seq; node = node->next) {
            steps.push_back(getStep(node));
        }

        std::string str;

        for (unsigned int i = 0; i < steps.size(); i++) {
            std::stringstream kernelFileName;
            kernelFileName << i << "_" << steps[i]->getBlasFunctionName()
                                       << "_" << cfg.cl();

            steps[i]->setKernelName(kernelFileName.str());
            if (cfg.decomposition(subdims)) {
                steps[i]->setDecomposition(subdims);
            }

            steps[i]->completeDecompositionSingle();

            steps[i]->declareVars(masterStep);

            std::cout << "Generating '" << steps[i]->kernelName()
                                        << "' ..." << std::endl;

            str = steps[i]->generate();
            if (str.empty()) {
                std::cerr << "failed" << std::endl;
                abort();
            }
            fs.open(kernelFileName.str().c_str());
            fs << str;
            fs.close();
        }

        clMath::KTest *ktest = new clMath::KTest(masterStep, &steps, &cfg);

        std::cout << "Generating '" << cfg.cpp() << "' ..." << std::endl;
        str = ktest->generate(cfg.withAccuracy());
        if (str.empty()) {
            std::cerr << "failed" << std::endl;
            abort();
        }
        fs.open(cfg.cpp().c_str());
        fs << str;
        fs.close();

        delete ktest;

        for (std::vector<clMath::Step*>::iterator it = steps.begin();
                it != steps.end(); ++it) {
            delete (*it);
        }
        steps.clear();
    }
    else {

        std::ofstream fs;

        masterStep->setKernelName(cfg.cl());

        std::cout << "Generating '" << masterStep->kernelName()
                                    << "' ..." << std::endl;

        masterStep->declareVars(NULL);

        std::string str;
        str = masterStep->generate();
        if (str.empty()) {
            std::cerr << "failed" << std::endl;
            abort();
        }
        fs.open(cfg.cl().c_str());
        fs << str;
        fs.close();

        clMath::KTest *ktest = new clMath::KTest(masterStep, &cfg);

        std::cout << "Generating '" << cfg.cpp() << "' ..." << std::endl;
        str = ktest->generate(cfg.withAccuracy());
        if (str.empty()) {
            std::cerr << "failed" << std::endl;
            abort();
        }
        fs.open(cfg.cpp().c_str());
        fs << str;
        fs.close();

        delete ktest;
    }

    if (cfg.permitMultiKernels()) {
        masterStep->freeSolutionSequence(&seq);
    }

    delete masterStep;

    return 0;
}
Esempio n. 22
0
// Runs the benchmark: for each of the requested steps the buffer sizes are
// recomputed, fresh host data is uploaded, the routine (and the optional
// clBLAS / CPU BLAS references) is timed, and one table row is printed.
// Between steps the m/n/k sizes and leading dimensions in 'args' grow by
// 'args.step'; at least one step is always executed.
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {

  // The table header comes first; one row per step follows
  PrintTableHeader(args);

  // OpenCL objects shared by all steps, plus the optional clBLAS set-up
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, args.device_id);
  auto context = Context(device);
  auto queue = Queue(context, device);
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasSetup(); }
  #endif

  // Executes "num_steps" steps (but never fewer than one)
  auto steps_done = size_t{0};
  do {

    // Routine-specific computation of the buffer sizes
    set_sizes(args);

    // Host-side input data, filled by PopulateVector
    std::vector<T> x_host(args.x_size);
    std::vector<T> y_host(args.y_size);
    std::vector<T> a_host(args.a_size);
    std::vector<T> b_host(args.b_size);
    std::vector<T> c_host(args.c_size);
    std::vector<T> ap_host(args.ap_size);
    std::vector<T> scalar_host(args.scalar_size);
    PopulateVector(x_host);
    PopulateVector(y_host);
    PopulateVector(a_host);
    PopulateVector(b_host);
    PopulateVector(c_host);
    PopulateVector(ap_host);
    PopulateVector(scalar_host);

    // Device-side buffers, filled from the host data
    auto x_dev = Buffer<T>(context, args.x_size);
    auto y_dev = Buffer<T>(context, args.y_size);
    auto a_dev = Buffer<T>(context, args.a_size);
    auto b_dev = Buffer<T>(context, args.b_size);
    auto c_dev = Buffer<T>(context, args.c_size);
    auto ap_dev = Buffer<T>(context, args.ap_size);
    auto scalar_dev = Buffer<T>(context, args.scalar_size);
    x_dev.Write(queue, args.x_size, x_host);
    y_dev.Write(queue, args.y_size, y_host);
    a_dev.Write(queue, args.a_size, a_host);
    b_dev.Write(queue, args.b_size, b_host);
    c_dev.Write(queue, args.c_size, c_host);
    ap_dev.Write(queue, args.ap_size, ap_host);
    scalar_dev.Write(queue, args.scalar_size, scalar_host);
    auto buffers = Buffers<T>{x_dev, y_dev, a_dev, b_dev, c_dev, ap_dev, scalar_dev};

    // Times each requested library and records (label, milliseconds)
    std::vector<std::pair<std::string, double>> timings;
    auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
    timings.emplace_back("CLBlast", ms_clblast);
    if (args.compare_clblas) {
      auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
      timings.emplace_back("clBLAS", ms_clblas);
    }
    if (args.compare_cblas) {
      auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS");
      timings.emplace_back("CPU BLAS", ms_cblas);
    }

    // One table row with the collected timings
    PrintTableRow(args, timings);

    // Grows the problem only when another step follows
    ++steps_done;
    if (steps_done < args.num_steps) {
      args.m += args.step;
      args.n += args.step;
      args.k += args.step;
      args.a_ld += args.step;
      args.b_ld += args.step;
      args.c_ld += args.step;
    }
  } while (steps_done < args.num_steps);

  // Tears down the clBLAS reference library again
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasTeardown(); }
  #endif
}
Esempio n. 23
0
/**
 * Initializes the OpenCL state of CLHelper.
 *
 * Selects the device at index device_number of the platform at index
 * platform_number, creates the context and command queue, compiles the
 * kernel programs, creates every kernel object and, when BUILD_CLBLAS is
 * defined, calls clblasSetup(). Failures are reported through FATAL. The
 * whole body is compiled only when BUILD_OPENCL is defined.
 *
 * @param platform_number  Index into the list of available OpenCL platforms.
 * @param device_number    Index into the device list of the chosen platform.
 */
void CLHelper::Init(unsigned int platform_number, unsigned int device_number) {
#ifdef BUILD_OPENCL
  cl_uint platform_count = 0;
  clGetPlatformIDs ( 0, 0, &platform_count );

  if ( platform_count == 0 ) {
    FATAL ( "No OpenCL platforms detected!" );
  }

  // Reject an out-of-range index before it is used to subscript the array.
  // The return covers the case that FATAL comes back to the caller.
  if ( platform_number >= platform_count ) {
    FATAL ( "OpenCL platform index out of range: " << platform_number );
    return;
  }

  cl_platform_id* platform_ids = new cl_platform_id[platform_count];
  clGetPlatformIDs ( platform_count, platform_ids, NULL );

  cl_uint device_count = 0;
  clGetDeviceIDs ( platform_ids[platform_number], CL_DEVICE_TYPE_ALL, 0,
                   NULL, &device_count );

  if ( device_count == 0 ) {
    FATAL ( "No OpenCL devices detected!" );
  }

  if ( device_number >= device_count ) {
    delete[] platform_ids;
    FATAL ( "OpenCL device index out of range: " << device_number );
    return;
  }

  cl_device_id* device_ids = new cl_device_id[device_count];
  clGetDeviceIDs ( platform_ids[platform_number], CL_DEVICE_TYPE_ALL,
                   device_count, device_ids, NULL );

  // Zero-initialized so that a failed query logs an empty name / "No"
  // instead of indeterminate memory
  char device_name_buffer[256] = { 0 };
  clGetDeviceInfo ( device_ids[device_number], CL_DEVICE_NAME,
                    sizeof ( device_name_buffer ), device_name_buffer, 0 );

  uint32_t support_buffer = 0;
  clGetDeviceInfo ( device_ids[device_number], CL_DEVICE_IMAGE_SUPPORT,
                    sizeof ( support_buffer ), &support_buffer, 0 );

  LOGINFO << "Using OpenCL device: " << device_name_buffer;
  LOGDEBUG << "Image support: " << ( support_buffer ? "Yes" : "No" );

  device = device_ids[device_number];

  // Create context
  const cl_context_properties context_properties [] = {
    CL_CONTEXT_PLATFORM,
    reinterpret_cast<cl_context_properties> ( platform_ids [platform_number] ),
    0, 0
  };

  LOGDEBUG << "Creating OpenCL context...";

  cl_int error = 0;
  context = clCreateContext ( context_properties, 1,
                              &device_ids[device_number], 0, 0, &error );

  if ( error != CL_SUCCESS ) {
    FATAL ( "Error creating OpenCL context: " << error );
  }

  // Create command queue
  LOGDEBUG << "Creating OpenCL command queue...";
  queue = clCreateCommandQueue ( context, device_ids[device_number], 0, &error );

  if ( error != CL_SUCCESS ) {
    FATAL ( "Error creating OpenCL command queue: " << error );
  }

  delete[] device_ids;
  delete[] platform_ids;

  // Compile kernels
  cl_program p_crossCorrelation = CreateProgram ( "kernels/crossCorrelation.cl" );
  cl_program p_biasedConvolution = CreateProgram ( "kernels/biasedConvolution.cl" );
  cl_program p_fullConvolution = CreateProgram ( "kernels/fullConvolution.cl" );
  cl_program p_foldWeights = CreateProgram ( "kernels/foldWeights.cl" );
  cl_program p_biasedMatrixVector = CreateProgram ( "kernels/biasedMatrixVector.cl" );
  cl_program p_biasGradient = CreateProgram ( "kernels/biasGradient.cl" );
  cl_program p_matrixMatrix = CreateProgram ( "kernels/matrixMatrix.cl" );
  cl_program p_maximum = CreateProgram ( "kernels/maximumPooling.cl" );
  cl_program p_amaximum = CreateProgram ( "kernels/advmaximumPooling.cl" );
  cl_program p_nonLinearFunctions = CreateProgram ( "kernels/nonLinearFunctions.cl" );
  cl_program p_scaling = CreateProgram ( "kernels/scaling.cl" );
  cl_program p_setValue = CreateProgram ( "kernels/setValue.cl" );
  cl_program p_sms = CreateProgram ( "kernels/sms.cl" );
  cl_program p_im2col = CreateProgram ( "kernels/im2col.cl" );

  // One row per kernel: destination handle, owning program, entry point.
  // The kernels are created in table order.
  struct KernelSpec {
    cl_kernel* kernel;
    cl_program program;
    const char* entry_point;
  };

  const KernelSpec kernel_specs[] = {
    { &k_crossCorrelation, p_crossCorrelation, "CROSS_CORRELATION" },
    { &k_biasedConvolution, p_biasedConvolution, "BIASED_CONVOLUTION" },
    { &k_fullConvolution, p_fullConvolution, "FULL_CONVOLUTION" },
    { &k_foldWeights, p_foldWeights, "FOLD_WEIGHTS" },
    { &k_biasedMatrixVector, p_biasedMatrixVector, "BIASED_MATRIX_VECTOR_FWD" },
    { &k_biasedMatrixVectorGrad, p_biasedMatrixVector, "BIASED_MATRIX_VECTOR_GRAD" },
    { &k_biasedMatrixVectorBackward, p_biasedMatrixVector, "BIASED_MATRIX_VECTOR_BWD" },
    { &k_biasGradientPart1, p_biasGradient, "BIAS_GRADIENT_PART1" },
    { &k_biasGradientPart2, p_biasGradient, "BIAS_GRADIENT_PART2" },
    { &k_matrixMatrix, p_matrixMatrix, "MATRIX_MATRIX" },
    { &k_maximumForward, p_maximum, "MAXIMUM_POOLING_FWD" },
    { &k_maximumBackward, p_maximum, "MAXIMUM_POOLING_BWD" },
    { &k_amaximumForward, p_amaximum, "AMAXIMUM_POOLING_FWD" },
    { &k_amaximumBackward, p_amaximum, "AMAXIMUM_POOLING_BWD" },
    { &k_nlTanh, p_nonLinearFunctions, "NL_TANH_FWD" },
    { &k_nlTanhBackward, p_nonLinearFunctions, "NL_TANH_BWD" },
    { &k_nlSigm, p_nonLinearFunctions, "NL_SIGM_FWD" },
    { &k_nlSigmBackward, p_nonLinearFunctions, "NL_SIGM_BWD" },
    { &k_setValue, p_setValue, "SET_VALUE" },
    { &k_sms, p_sms, "SMS" },
    { &k_im2col, p_im2col, "IM2COL" },
    { &k_col2im, p_im2col, "COL2IM" },
    { &k_up, p_scaling, "UP" },
    { &k_down, p_scaling, "DOWN" }
  };
  const unsigned int kernel_spec_count =
    sizeof ( kernel_specs ) / sizeof ( kernel_specs[0] );

  for ( unsigned int i = 0; i < kernel_spec_count; i++ ) {
    *kernel_specs[i].kernel = clCreateKernel ( kernel_specs[i].program,
                                               kernel_specs[i].entry_point, &error );

    if ( error != CL_SUCCESS ) {
      FATAL ( "Error creating kernel: " << ( signed int ) error );
    }
  }

#ifdef BUILD_CLBLAS
  cl_int err = clblasSetup();
  if (err!=CL_SUCCESS)
    FATAL("Call to clblasSetup failed. Error: " << err);
#endif

#endif

}