コード例 #1
0
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  cl_uint num_ev = 0;
  cl_event evl[3];
  cl_event ev;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
                                  M, N, K,
                                  alpha, A->buf, offA, lda, B->buf, offB, ldb,
                                  beta, C->buf, offC, ldc, 1, &ctx->q,
                                  num_ev, num_ev == 0 ? NULL : evl, &ev));

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
コード例 #2
0
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event evl[3];
  cl_event ev;
  size_t i;
  cl_uint num_ev = 0;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
                                    M, N, K,
                                    alpha, A[i]->buf, offA[i], lda,
                                    B[i]->buf, offB[i], ldb,
                                    beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
                                    num_ev, num_ev == 0 ? NULL : evl, &ev));
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }

  return GA_NO_ERROR;
}
コード例 #3
0
ファイル: clfunc_xgemm.hpp プロジェクト: AndreasMiller/clBLAS
void 
xGemm<cl_float>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
  for (unsigned int i = 0; i < numQueues; i++) {
    events_[i] = NULL;
  }
	for (unsigned int i = 0; i < apiCallCount; i++)
	{
		clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
			buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
			buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
			buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
			buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
			buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_);
	}
	//flush==true if only the kernel time (library call) is timed
	//flush==false if memory time is also timed
	if (flush==true)
	{
    // check if any valid events returned
    cl_uint numValidEvents = 0;
    for (unsigned int i = 0; i < numQueuesToUse; i++) {
      if (events_[i]) {
        cl_uint clReferenceCount;
        cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT, sizeof(clReferenceCount), &clReferenceCount, NULL);
        if ( err == CL_SUCCESS) {
          //printf("events[%u/%u] has %u references\n", i, numQueuesToUse, clReferenceCount );
          numValidEvents++;
        } else {
          //printf("events[%u/%u] invalid; err = %i\n", i, numQueuesToUse, err );
        }
      } else {
        //printf("events[%u/%u] is NULL\n", i, numQueuesToUse );
      }
    }
    
    for (unsigned int i = 0; i < numQueuesToUse; i++) {
      clFlush(queues_[i]);
    }
		clWaitForEvents(numValidEvents, events_);
	}
}
コード例 #4
0
ファイル: matmul.c プロジェクト: cirosantilli/cpp-cheat
void mat_mul_cl_clblas(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_event event;
    size_t mat_sizeof;

    mat_sizeof = n * n * sizeof(F);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clblasSgemm(
        clblasRowMajor,
        clblasNoTrans,
        clblasNoTrans,

        n,
        n,
        n,
        1.0,

        cache->buf_a,
        0,
        n,

        cache->buf_b,
        0,
        n,

        0.0,
        cache->buf_c,
        0,
        n,

        1,
        &(cache->common.command_queue),
        0,
        NULL,
        &event
    );
    clWaitForEvents(1, &event);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}
コード例 #5
0
ファイル: example_sgemm.c プロジェクト: liangfu/helloworlds
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufB, bufC;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    // print device name
    int valueSize=0;
    clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &valueSize);                                                                                                        
    char * value = (char*) malloc(valueSize);                                                                                                                                       
    clGetDeviceInfo(device, CL_DEVICE_NAME, valueSize, value, NULL);                                                                                                     
    printf("Device: %sn\n", value);                                                                                                                                 
    free(value);                                                                                                                                                             

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
                          NULL, &err);
    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
                          NULL, &err);
    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
                          NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
        M * K * sizeof(*A), A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
        K * N * sizeof(*B), B, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
        M * N * sizeof(*C), C, 0, NULL, NULL);

    /* Call clblas extended function. Perform gemm for the lower right sub-matrices */
    err = clblasSgemm(order, transA, transB, M - off, N - off, K - off,
                         alpha, bufA, offA, lda,
                         bufB, offB, ldb, beta,
                         bufC, offC, ldc,
                         1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSgemmEx() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
                                  M * N * sizeof(*result),
                                  result, 0, NULL, NULL);

        /* At this point you will get the result of SGEMM placed in 'result' array. */
        puts("");
        printResult("clblasSgemmEx result");
    }

    /* Release OpenCL events. */
    clReleaseEvent(event);

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufC);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufA);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
コード例 #6
0
ファイル: TensorMath.cpp プロジェクト: imclab/cn24
void TensorMath::GEMM(const bool is_row_major, const bool transpose_A, const bool transpose_B, const int M, const int N, const int K, const datum alpha, const Conv::Tensor &A, const int smA, const int ldA, const Conv::Tensor &B, const int smB, const int ldB, const datum beta, Conv::Tensor &C, const int smC, const int ldC)
{
#ifdef BUILD_CLBLAS
  ((Tensor&)A).MoveToGPU();
  ((Tensor&)B).MoveToGPU();
  C.MoveToGPU(C.hint_ignore_content_ && beta == 0.0);
  
  cl_event done_event = NULL;
  
  const int offA = A.width() * A.height() * A.maps() * smA;
  const int offB = B.width() * B.height() * B.maps() * smB;
  const int offC = C.width() * C.height() * C.maps() * smC;
  
  cl_int err =
    clblasSgemm(is_row_major ? clblasRowMajor : clblasColumnMajor,
    transpose_A ? clblasTrans : clblasNoTrans,
    transpose_B ? clblasTrans : clblasNoTrans,
    M, N, K, alpha, (cl_mem)A.cl_data_ptr_, offA, ldA,
    (cl_mem)B.cl_data_ptr_, offB, ldB, beta,
    (cl_mem)C.cl_data_ptr_, offC, ldC,
    1, &(CLHelper::queue), 0, NULL, &done_event);
  
  if(err!=CL_SUCCESS)
    FATAL("Call to clblasSgemm failed. Error: " << err);
#else
  
#ifdef BUILD_OPENCL
  ((Tensor&)A).MoveToCPU();
  ((Tensor&)B).MoveToCPU();
  C.MoveToCPU(C.hint_ignore_content_ && beta == 0.0);
#endif 
  
#ifdef BUILD_BLAS
  INNERGEMM(is_row_major ? CblasRowMajor : CblasColMajor,
    transpose_A ? CblasTrans : CblasNoTrans,
    transpose_B ? CblasTrans : CblasNoTrans,
    M, N, K,
    alpha, A.data_ptr_const(0,0,0,smA), ldA,
    B.data_ptr_const(0,0,0,smB), ldB,
    beta, C.data_ptr(0,0,0,smC), ldC);
#else
  if(!is_row_major)
    FATAL("Reference GEMM does not support column-major matrices!");
  
  const datum* a_ptr = A.data_ptr_const(0, 0, 0, smA);
  const datum* b_ptr = B.data_ptr_const(0, 0, 0, smB);
  datum* c_ptr = C.data_ptr(0, 0, 0, smC);
  
  #pragma omp parallel for default(shared)
  for(int i = 0; i < M; i++) {
    for(int j = 0; j < N; j++) {
      datum sum = 0.0;
      for(int k = 0; k < K; k++) {
        const datum a_value = transpose_A ?
          a_ptr[k * ldA + i]
        :
          a_ptr[i * ldA + k];
        
        const datum b_value = transpose_B ?
          b_ptr[j * ldB + k]
        :
          b_ptr[k * ldB + j];
          
        sum += a_value * b_value;
      }
      if(beta == 0.0)
        c_ptr[ldC * i + j] = alpha * sum;
      else
        c_ptr[ldC * i + j] = beta * c_ptr[ldC * i + j] + alpha * sum;
    }
  }
#endif // BUILD_BLAS
#endif // BUILD_CLBLAS
  C.hint_ignore_content_ = false;
}
コード例 #7
0
ファイル: gemm_clblas.cpp プロジェクト: quadrivio/multiblas
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB,
                        double alpha, double beta, void *outMatrix, bool use_float)
{
    std::stringstream result;
    
    float *input_matrixA_f = (float *)inMatrixA;
    float *input_matrixB_f = (float *)inMatrixB;
    
    float *output_matrix_f = (float *)outMatrix;
    
    double *input_matrixA_d = (double *)inMatrixA;
    double *input_matrixB_d = (double *)inMatrixB;
    
    double *output_matrix_d = (double *)outMatrix;
    
    if (debug) {
        result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") <<
        ")" << std::endl << std::endl;
    }
    
    cl_int err = CL_SUCCESS;
    
    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        
        err = CL_INVALID_OPERATION;
    }
    
    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }
    
    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }
    
    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }
    
    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
        
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }
    
    // buffers
    cl_mem cl_input_matrixA = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixA:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(float), input_matrixA_f, &err);
            
        } else {
            cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowA * ncolA * sizeof(double), input_matrixA_d, &err);
        }
    }
    
    cl_mem cl_input_matrixB = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixB:" << std::endl;
        }
        
        if (use_float) {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(float), input_matrixB_f, &err);
            
        } else {
            cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                              nrowB * ncolB * sizeof(double), input_matrixB_d, &err);
        }
    }
    
    int nrowC = transposeA ? ncolA : nrowA;
    int ncolC = transposeB ? nrowB : ncolB;
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        
        if (use_float) {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(float), output_matrix_f, &err);
            
        } else {
            cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                              nrowC * ncolC * sizeof(double), output_matrix_d, &err);
        }
        
    }
    
    // ++++++++++++
    const int lda = nrowA;  // first dimension of A (rows), before any transpose
    const int ldb = nrowB;  // first dimension of B (rows), before any transpose
    const int ldc = nrowC;      // first dimension of C (rows)
    
    const int M = transposeA ? ncolA : nrowA;    // rows in A (after transpose, if any) and C
    const int N = transposeB ? nrowB : ncolB;    // cols in B (after transpose, if any) and C
    const int K = transposeA ? nrowA : ncolA;    // cols in A and rows in B (after transposes, if any)
    
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
    const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;
    
    cl_event event = NULL;
    
    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSgemm:" << std::endl;
            }
            
            status = clblasSgemm(order, transA, transB, M, N, K,
                              alpha, cl_input_matrixA, 0, lda,
                              cl_input_matrixB, 0, ldb, beta,
                              cl_output_matrix, 0, ldc,
                              1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS && debug) {
                result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
            }
            
        } else {
            if (debug) {
                result << "clblasDgemm:" << std::endl;
            }
            
            status = clblasDgemm(order, transA, transB, M, N, K,
                                 alpha, cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb, beta,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                
                err = status;
            }
        }
    }
    
    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }
    
    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        
        if (use_float) {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL);
            
        } else {
            clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL);
        }
    }
    
    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;
    
    // cleanup
    clReleaseMemObject(cl_output_matrix);
    cl_output_matrix = NULL;
    
    clReleaseMemObject(cl_input_matrixA);
    cl_input_matrixA = NULL;
    
    clReleaseMemObject(cl_input_matrixB);
    cl_input_matrixB = NULL;
    
    clReleaseCommandQueue(queue);
    queue = NULL;
    
    clReleaseContext(context);
    context = NULL;
    
    if (debug) {
        CERR << result.str();
    }
    
    ErrorStatus errorStatus = { err, status };
    
    return errorStatus;
}
コード例 #8
0
ファイル: foo.cpp プロジェクト: ConfusedReality/core_hunter
int main( void )
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA, bufB, bufC;
    cl_event event = NULL;
    int ret = 0;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs( 1, &platform, NULL );
    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
    queue = clCreateCommandQueue( ctx, device, 0, &err );

    /* Setup clBLAS */
    err = clblasSetup( );

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
                           NULL, &err );
    bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
                           NULL, &err );
    bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
                           NULL, &err );

    err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
                                M * K * sizeof( *A ), A, 0, NULL, NULL );
    err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
                                K * N * sizeof( *B ), B, 0, NULL, NULL );
    err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
                                M * N * sizeof( *C ), C, 0, NULL, NULL );

    /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
    err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
                       M, N, K,
                       alpha, bufA, 0, lda,
                       bufB, 0, ldb, beta,
                       bufC, 0, ldc,
                       1, &queue, 0, NULL, &event );

    /* Wait for calculations to be finished. */
    err = clWaitForEvents( 1, &event );

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
                               M * N * sizeof(*result),
                               result, 0, NULL, NULL );

    /* Release OpenCL memory objects. */
    clReleaseMemObject( bufC );
    clReleaseMemObject( bufB );
    clReleaseMemObject( bufA );

    /* Finalize work with clBLAS */
    clblasTeardown( );

    /* Release OpenCL working objects. */
    clReleaseCommandQueue( queue );
    clReleaseContext( ctx );

    return ret;
}