/*
 * Timed round trip for the single-precision complex HER2K benchmark.
 *
 * Between timer.Start() and timer.Stop() this:
 *   1. creates the device buffers A_, B_ (read-only) and C_ (read-write),
 *      each sized as (leading dimension * number of vectors + offset)
 *      elements of cl_float2;
 *   2. calls this->initialize_gpu_buffer();
 *   3. enqueues clblasCher2k on queue_;
 *   4. reads the computed C back from the device into buffer_.cpuC_.
 *
 * NOTE(review): the status codes stored in `err` are not inspected here;
 * no error-reporting mechanism is visible in this function.
 */
void 
xHer2k<cl_float2>::roundtrip_func()
{
    timer.Start(timer_id);
    cl_int err;

    buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                (buffer_.lda_ * buffer_.a_num_vectors_ +
                                    buffer_.offa_) * sizeof(cl_float2),
                                NULL, &err);
    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                    buffer_.offb_) * sizeof(cl_float2),
                                NULL, &err);
    buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                    buffer_.offc_) * sizeof(cl_float2),
                                NULL, &err);

    this->initialize_gpu_buffer();

    /* beta is passed as the first component of the cl_float2 beta_. */
    clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
                 buffer_.N_, buffer_.K_, buffer_.alpha_,
                 buffer_.A_, buffer_.offa_, buffer_.lda_,
                 buffer_.B_, buffer_.offb_, buffer_.ldb_,
                 buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
                 buffer_.ldc_, 1, &queue_, 0, NULL, NULL);

    /*
     * Fetch the result: a blocking read of C from the device into the host
     * copy. A write here would overwrite the freshly computed result with
     * host data instead of completing the round trip.
     */
    err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
                              buffer_.offc_ * sizeof(cl_float2),
                              buffer_.ldc_ * buffer_.c_num_vectors_ *
                                  sizeof(cl_float2),
                              buffer_.cpuC_, 0, NULL, &event_);
    clWaitForEvents(1, &event_);
    timer.Stop(timer_id);
}
Esempio n. 2
0
/** Perform Hermitian rank-2k update.
        \f$ C = \alpha A B^H + \overline{\alpha} B A^H + \beta C \f$ (trans == MagmaNoTrans), or \n
        \f$ C = \alpha A^H B + \overline{\alpha} B^H A + \beta C \f$ (trans == MagmaConjTrans),  \n
        where \f$ C \f$ is Hermitian.

    @param[in]
    uplo    Whether the upper or lower triangle of C is referenced.

    @param[in]
    trans   Operation to perform on A and B.

    @param[in]
    n       Number of rows and columns of C. n >= 0.

    @param[in]
    k       Number of columns of A and B (for MagmaNoTrans) or rows of A and B (for MagmaConjTrans). k >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array on GPU device.
            If trans == MagmaNoTrans, the n-by-k matrix A of dimension (ldda,k), ldda >= max(1,n); \n
            otherwise,                the k-by-n matrix A of dimension (ldda,n), ldda >= max(1,k).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      COMPLEX array on GPU device.
            If trans == MagmaNoTrans, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n); \n
            otherwise,                the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k).

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      COMPLEX array on GPU device.
            The n-by-n Hermitian matrix C of dimension (lddc,n), lddc >= max(1,n).

    @param[in]
    lddc    Leading dimension of dC.

    @ingroup magma_cblas3
*/
extern "C" void
magma_cher2k(
    magma_uplo_t uplo, magma_trans_t trans,
    magma_int_t n, magma_int_t k,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb,
    float beta,
    magmaFloatComplex_ptr dC, size_t dC_offset, magma_int_t lddc,
    magma_queue_t queue )
{
    // Work is enqueued only when both n and k are positive; otherwise
    // the call returns without touching the queue.
    if (n > 0 && k > 0) {
        // Forward to clBLAS in column-major layout on the single queue
        // supplied by the caller; g_event is passed as the output event.
        const cl_int status = clblasCher2k(
            clblasColumnMajor,
            clblas_uplo_const( uplo ), clblas_trans_const( trans ),
            n, k,
            alpha,
            dA, dA_offset, ldda,
            dB, dB_offset, lddb,
            beta,
            dC, dC_offset, lddc,
            1, &queue, 0, NULL, g_event );

        // The queue is flushed before the returned status is checked.
        clFlush(queue);
        check_error( status );
    }
}
/*
 * Times a single clblasCher2k launch on buffers that already exist:
 * starts the timer, enqueues the routine on queue_ with event_ as its
 * output event, blocks until that event completes, then stops the timer.
 */
void 
xHer2k<cl_float2>::call_func()
{
    timer.Start(timer_id);

    clblasCher2k(
        order_, buffer_.uplo_, buffer_.transA_,
        buffer_.N_, buffer_.K_,
        buffer_.alpha_,
        buffer_.A_, buffer_.offa_, buffer_.lda_,
        buffer_.B_, buffer_.offb_, buffer_.ldb_,
        buffer_.beta_.s[0],   /* first component of the cl_float2 beta_ */
        buffer_.C_, buffer_.offc_, buffer_.ldc_,
        1, &queue_,
        0, NULL,
        &event_);

    /* Block on the launch's event so the timer covers its execution. */
    clWaitForEvents(1, &event_);

    timer.Stop(timer_id);
}
Esempio n. 4
0
/**
 * Sample driver for clblasCher2k.
 *
 * Sets up an OpenCL platform, GPU device, context and command queue,
 * initializes clBLAS, uploads the file-level matrices A, B and C to device
 * buffers, runs clblasCher2k, reads C back and prints it with printResult().
 *
 * Every OpenCL/clBLAS object acquired along the way is released on all
 * exit paths.
 *
 * @return 0 on success, 1 if any setup, transfer or compute step fails.
 */
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufA = NULL;
    cl_mem bufB = NULL;
    cl_mem bufC = NULL;
    cl_event event = NULL;
    int ret = 1;

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        goto out_context;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        goto out_queue;
    }

    /* Prepare OpenCL memory objects and place matrices inside them. */
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A),
                          NULL, &err);
    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B),
                          NULL, &err);
    bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C),
                          NULL, &err);

    if ((bufA == NULL) || (bufC == NULL) || (bufB == NULL)) {
        printf("Failed to create buffer\n");
        goto out_teardown;
    }

    /* Blocking uploads; stop at the first one that fails. */
    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
        N * K * sizeof(*A), A, 0, NULL, NULL);
    if (err == CL_SUCCESS) {
        err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
            N * K * sizeof(*B), B, 0, NULL, NULL);
    }
    if (err == CL_SUCCESS) {
        err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
            N * N * sizeof(*C), C, 0, NULL, NULL);
    }
    if (err != CL_SUCCESS) {
        printf("clEnqueueWriteBuffer() failed with %d\n", err);
        goto out_teardown;
    }

    /* Call clblas function. */
    err = clblasCher2k(order, uplo, transA, N, K, alpha, bufA, 0, lda, bufB, 0, ldb,
                            beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasCher2k() failed with %d\n", err);
        goto out_teardown;
    }

    /* Wait for calculations to be finished. */
    err = clWaitForEvents(1, &event);
    if (err != CL_SUCCESS) {
        printf("clWaitForEvents() failed with %d\n", err);
        goto out_teardown;
    }

    /* Fetch results of calculations from GPU memory. */
    err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C),
                              C, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("clEnqueueReadBuffer() failed with %d\n", err);
        goto out_teardown;
    }

    /* At this point the result of CHER2K is in the C array. */
    printResult();
    ret = 0;

out_teardown:
    /* Release the OpenCL event, if the BLAS call produced one. */
    if (event != NULL) {
        clReleaseEvent(event);
    }

    /* Release whichever OpenCL memory objects were created. */
    if (bufC != NULL) {
        clReleaseMemObject(bufC);
    }
    if (bufB != NULL) {
        clReleaseMemObject(bufB);
    }
    if (bufA != NULL) {
        clReleaseMemObject(bufA);
    }

    /* Finalize work with clblas. */
    clblasTeardown();

out_queue:
    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
out_context:
    clReleaseContext(ctx);

    return ret;
}