int float_copyHost2GPU(PGM_Matriz_Double *host,  float *device, int device_col, PGM_Matriz_Float *work){

	int i, j;
	double *ptr;
	for(i = 0; i < host->n_linhas; i++){
		ptr = host->valor+(i*host->n_colunas);

		for( j = 0; j < host->n_colunas; j++)
			work->valor[j] = (float) ptr[j];

        for(; j < work->n_colunas; j++)
            work->valor[j] = 0;

		if(cublasSetVector(work->n_colunas, sizeof(float), work->valor, 1, device+(device_col*i), 1) != CUBLAS_STATUS_SUCCESS){
			printf("Error: nao foi possivel executar a copia\n");
			return -1;
		}
	}

	for ( i = 0; i < work->n_colunas; i++) work->valor[i] = 0;

	for ( ; i < work->n_colunas; j++){
        if(cublasSetVector(work->n_colunas, sizeof(float), work->valor, 1, device+(device_col*i), 1) != CUBLAS_STATUS_SUCCESS){
			printf("Error: nao foi possivel executar a copia\n");
			return -1;
		}
	}
	return CUBLAS_STATUS_SUCCESS;
}
void cuda_dot(const cublasHandle_t blasHandle, const int n, const T x[], int incX, const T y[], int incY, T* result, DOT dot)
{
	T *d_X = NULL;
	T *d_Y = NULL;
	cudaMalloc((void**)&d_X, n*sizeof(T));
	cudaMalloc((void**)&d_Y, n*sizeof(T));

	cublasSetVector(n, sizeof(T), x, incX, d_X, incX);
	cublasSetVector(n, sizeof(T), y, incY, d_Y, incY);

	dot(blasHandle, n, d_X, incX, d_Y, incY, result);

	cudaFree(d_X);
	cudaFree(d_Y);
}
void
mexFunction( int nlhs, mxArray *plhs[],
             int nrhs,  const mxArray *prhs[] )
{

  int status;
  int N;
  int SIZE;
  void * h_A;
  void * d_A;
  int incx;
  int incy;

  if (nrhs!=6)
    mexErrMsgTxt("Wrong number of arguments");


  N         =  (int) mxGetScalar(prhs[0]);
  SIZE      =  (int) mxGetScalar(prhs[1]);
  h_A       =  (void *) mxGetPr(prhs[2]);
  incx      =  (int) mxGetScalar(prhs[3]);
  d_A       =  (void *) (UINTPTR mxGetScalar(prhs[4]));
  incy      =  (int) mxGetScalar(prhs[5]);

  status = cublasSetVector(N, SIZE, h_A, incx, d_A, incy);

  plhs[0] = mxCreateDoubleScalar(status);

}
void cuda_axpy(const cublasHandle_t blasHandle, const int n, const T alpha, const T x[], int incX, T y[], int incY, AXPY axpy)
{
	T *d_X = NULL;
	T *d_Y = NULL;
	cudaMalloc((void**)&d_X, n*sizeof(T));
	cudaMalloc((void**)&d_Y, n*sizeof(T));

	cublasSetVector(n, sizeof(T), x, incX, d_X, incX);
	cublasSetVector(n, sizeof(T), y, incY, d_Y, incY);

	axpy(blasHandle, n, &alpha, d_X, incX, d_Y, incX);

	cublasGetVector(n, sizeof(T), d_Y, incY, y, incY);

	cudaFree(d_X);
	cudaFree(d_Y);
}
void CudaUtil::cublasCheckSetVector(int n, int elemSize, void* hostPtr, int incx, void* devicePtr, int incy, int line, const char* file)
{
	cublasStatus_t status = cublasSetVector(n, elemSize, hostPtr, incx, devicePtr, incy);
	if (status != CUBLAS_STATUS_SUCCESS) {
		std::ostringstream os;
		os << "CUBALS device write error, line " << line << ", in file " << file;
		throw CudaException(os.str());
	}
}
Exemple #6
0
SEXP d_setVector(SEXP v)
{
	int n = length(v);
	double * d_v;

	cublasAlloc(n, sizeof(double), (void **)&d_v);
	cublasSetVector(n, sizeof(double), REAL(v), 1, d_v, 1);
	checkCublasError("d_setVector");
	
	return packVector(n, d_v);
}
Exemple #7
0
void magma_setvector(
    magma_int_t n, size_t elemSize,
    void const* hx_src, magma_int_t incx,
    void*       dy_dst, magma_int_t incy )
{
    cublasStatus_t status;
    status = cublasSetVector(
        n, elemSize,
        hx_src, incx,
        dy_dst, incy );
    check_error( status );
}
Exemple #8
0
void magma_setvector_internal(
    magma_int_t n, magma_int_t elemSize,
    void const* hx_src, magma_int_t incx,
    void*       dy_dst, magma_int_t incy,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetVector(
        n, elemSize,
        hx_src, incx,
        dy_dst, incy );
    check_xerror( status, func, file, line );
}
void solve_system_on_gpu(gpu_symm_band_matrix gpu_matrix, double * b, cublasHandle_t handle)
{
	double * d_b;
	checkCudaErrors(cudaMalloc(&d_b, gpu_matrix.order*sizeof(double)));
	checkCublasErrors(cublasSetVector(gpu_matrix.order, sizeof(double), b, 1, d_b, 1));
	
	solve_lower_system_on_gpu(gpu_matrix, d_b, handle);
	solve_upper_system_on_gpu(gpu_matrix, d_b, handle);

	checkCublasErrors(cublasGetVector(gpu_matrix.order, sizeof(double), d_b, 1, b, 1));

	checkCudaErrors(cudaFree(d_b));
}
Exemple #10
0
void magma_ssetvector_internal(
    magma_int_t n,
    float const* hx_src, magma_int_t incx,
    float*       dy_dst, magma_int_t incy,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetVector(
        n, sizeof(float),
        hx_src, incx,
        dy_dst, incy );
    check_xerror( status, func, file, line );
}
Exemple #11
0
void cuda_scal(const cublasHandle_t blasHandle, const int n, const T alpha, T x[], int incX, SCAL scal)
{
	T *d_X = NULL;
	cudaMalloc((void**)&d_X, n*sizeof(T));

	cublasSetVector(n, sizeof(T), x, incX, d_X, incX);

	scal(blasHandle, n, &alpha, d_X, incX);

	cublasGetVector(n, sizeof(T), d_X, incX, x, incX);

	cudaFree(d_X);
}
Exemple #12
0
// ========================================
// copying vectors
extern "C" void
magma_zsetvector_internal(
    magma_int_t n,
    magmaDoubleComplex const* hx_src, magma_int_t incx,
    magmaDoubleComplex_ptr    dy_dst, magma_int_t incy,
    const char* func, const char* file, int line )
{
    cublasStatus_t status;
    status = cublasSetVector(
        n, sizeof(magmaDoubleComplex),
        hx_src, incx,
        dy_dst, incy );
    check_xerror( status, func, file, line );
}
Exemple #13
0
long benchmark(int size) {
    long requestStart, requestEnd;
    int incx = 1, incy = 1, n = size;
    double *cuA, *cuB;
    cublasStatus status;


    double* a = random_array(size);
    double* b = random_array(size);

    status = cublasAlloc(n, sizeof(double),(void**)&cuA);
    checkStatus("A", status);
    status = cublasAlloc(n, sizeof(double),(void**)&cuB);
    checkStatus("B", status);

    status = cublasSetVector(n, sizeof(double), a, incx, cuA, incx);
    checkStatus("setA", status);

    status = cublasSetVector(n, sizeof(double), b, incy, cuB, incy);
    checkStatus("setB", status);

    requestStart = currentTimeNanos();

    cublasDdot(n, cuA, incx, cuB, incy);

    requestEnd = currentTimeNanos();

    status = cublasFree(cuA);
    checkStatus("freeA", status);
    status = cublasFree(cuB);
    checkStatus("freeB", status);

    free(a);
    free(b);

    return (requestEnd - requestStart);
}
void mat_add_mat(const double *x, double *y, double scalar, int n){
	cudaError_t cudaStat ; // cudaMalloc status
	cublasStatus_t stat ; // CUBLAS functions status
	cublasHandle_t handle ; // CUBLAS context	
// on the device
	double *d_x; // d_x - x on the device
	double *d_y; // d_y - y on the device

	cudaStat = cudaMalloc (( void **)& d_x, n*sizeof(*x)); // device
	// memory alloc for x
	cudaStat = cudaMalloc (( void **)& d_y, n*sizeof(*y)); // device
	// memory alloc for y
	stat = cublasCreate (& handle ); // initialize CUBLAS context
	stat = cublasSetVector (n, sizeof (*x), x ,1 ,d_x, 1); // cp x- >d_x
	stat = cublasSetVector (n, sizeof (*y), y ,1 ,d_y, 1); // cp y- >d_y

	stat=cublasDaxpy(handle,n,&scalar,d_x,1,d_y,1);


	cudaFree (d_x ); // free device memory
	cudaFree (d_y ); // free device memory
	cublasDestroy ( handle ); // destroy CUBLAS context

}
Exemple #15
0
double magma_get_norm_sy(SEXP obj, const char *typstr)
{
#ifdef HIPLAR_WITH_MAGMA
	char typnm[] = {'\0', '\0'};
	int *dims = INTEGER(GET_SLOT(obj, Matrix_DimSym));
	double *work = (double *) NULL;
	int N = dims[0];
	int lda = N;
	double *A = REAL(GET_SLOT(obj, Matrix_xSym));
	typnm[0] = La_norm_type(typstr);

	const char *c = uplo_P(obj);

	//Magmablas dlansy only does I & M norms
	if(GPUFlag == 1 && (*typnm == 'I' || *typnm == 'M')) {
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing norm using magmablas_dlansy"); 
#endif
		double *dwork, *d_A, maxnorm;
		cublasAlloc(N, sizeof(double), (void**)&dwork);
		cublasAlloc(lda * N, sizeof(double), (void**)&d_A);
		cublasSetVector(N * lda, sizeof(double), A, 1, d_A, 1);
		maxnorm = magmablas_dlansy(typnm[0], *c ,N, d_A, lda, dwork);
		cublasFree(d_A);
		cublasFree(dwork);
		return maxnorm;
	}
	else {

		if (*typnm == 'I' || *typnm == 'O') {
			work = (double *) R_alloc(dims[0], sizeof(double));
		}

		return F77_CALL(dlansy)(typnm, uplo_P(obj),
				dims, A,
				dims, work);
	}
#endif
	return 0.0;
}
Exemple #16
0
static
double magma_get_norm(SEXP obj, const char *typstr)
{
#ifdef HIPLAR_WITH_MAGMA
	if(any_NA_in_x(obj))
		return NA_REAL;
	else {
		char typnm[] = {'\0', '\0'};
		int *dims = INTEGER(GET_SLOT(obj, Matrix_DimSym));
		double *work = (double *) NULL;

		typnm[0] = La_norm_type(typstr);
		if (*typnm == 'I') {
			work = (double *) R_alloc(dims[0], sizeof(double));
			if(GPUFlag == 1 && (dims[0] % 64 == 0) && (dims[1] % 64 == 0)) {
#ifdef HIPLAR_DBG
	R_ShowMessage("DBG: Getting norm using magmablas_dlange");
#endif
				double *d_work, *d_A, *A, val;
				A = REAL(GET_SLOT(obj, Matrix_xSym));
				cublasAlloc(dims[0] * dims[1], sizeof(double), (void**)&d_A);
				cublasAlloc(dims[0], sizeof(double), (void**)&d_work);
				cublasSetVector(dims[0] * dims[1], sizeof(double), A, 1, d_A, 1);
				val = magmablas_dlange(*typstr, dims[0], dims[1], d_A, dims[0], d_work);
				cudaFree(d_A);
				cudaFree(d_work);
				return val;
			}

		}
		return F77_CALL(dlange)(typstr, dims, dims+1,
				REAL(GET_SLOT(obj, Matrix_xSym)),
				dims, work);
	}
#endif

	return 0.0;
}
Exemple #17
0
/* Main */
int test_cublas(void)
{    
    cublasStatus status;
    cudaError_t e;
    float* h_A;
    float* h_B;
    float* h_C;
    float* h_C_ref;
    float* d_A = 0;
    void *vp;
    float* d_B = 0;
    float* d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float*)malloc(n2 * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    h_B = (float*)malloc(n2 * sizeof(h_B[0]));
    if (h_B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    d_A = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    d_B = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
    d_C = (float *) vp;

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }
    
    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Clear last error */
    cublasGetError();

    /* Performs operation using cublas */
    cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    
    /* Allocate host memory for reading back the result from device memory */
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (i = 0; i < n2; ++i) {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) {
        fprintf (stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }
    printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    e = cudaFree(d_A);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_B);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_C);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
/* Main */
void mexFunction( int nlhs, mxArray *plhs[],
                  int nrhs, const mxArray *prhs[]) {

    if (nrhs != 7) {
        mexErrMsgTxt("sgemm requires 7 input arguments");
    } else if (nlhs != 1) {
        mexErrMsgTxt("sgemm requires 1 output argument");
    }

    if ( !mxIsSingle(prhs[4]) ||
            !mxIsSingle(prhs[5]) ||
            !mxIsSingle(prhs[6]))   {
        mexErrMsgTxt("Input arrays must be single precision.");
    }

    int ta = (int) mxGetScalar(prhs[0]);
    int tb = (int) mxGetScalar(prhs[1]);
    float alpha = (float) mxGetScalar(prhs[2]);
    float beta = (float) mxGetScalar(prhs[3]);
    float *h_A = (float*) mxGetData(prhs[4]);
    float *h_B = (float*) mxGetData(prhs[5]);
    float *h_C = (float*) mxGetData(prhs[6]);

    int M = mxGetM(prhs[4]);   /* gets number of rows of A */
    int K = mxGetN(prhs[4]);   /* gets number of columns of A */
    int L = mxGetM(prhs[5]);   /* gets number of rows of B */
    int N = mxGetN(prhs[5]);   /* gets number of columns of B */

    cublasOperation_t transa, transb;
    int MM, KK, NN;
    if (ta == 0) {
        transa = CUBLAS_OP_N;
        MM=M;
        KK=K;
    } else {
        transa = CUBLAS_OP_T;
        MM=K;
        KK=M;
    }

    if (tb == 0) {
        transb = CUBLAS_OP_N;
        NN=N;
    } else {
        transb = CUBLAS_OP_T;
        NN=L;
    }

    /*	printf("transa=%c\n",transa);
    	printf("transb=%c\n",transb);
    	printf("alpha=%f\n",alpha);
    	printf("beta=%f\n",beta);	*/

    /* Left hand side matrix set up */
    mwSize dims0[2];
    dims0[0]=MM;
    dims0[1]=NN;
    plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);
    float *h_C_out = (float*) mxGetData(plhs[0]);

    cublasStatus_t status;
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! CUBLAS initialization error\n");
    }

    float* d_A = 0;
    float* d_B = 0;
    float* d_C = 0;

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void**)&d_A, M * K * sizeof(d_A[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate A)\n");
    }
    if (cudaMalloc((void**)&d_B, L * N * sizeof(d_B[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate B)\n");

    }
    if (cudaMalloc((void**)&d_C, MM * NN * sizeof(d_C[0])) != cudaSuccess) {
        mexErrMsgTxt("!!!! device memory allocation error (allocate C)\n");
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write A)\n");

    }
    status = cublasSetVector(L * N, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write B)\n");
    }
    status = cublasSetVector(MM * NN, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (write C)\n");
    }

    /* Performs operation using cublas */
    status = cublasSgemm(handle, transa, transb, MM, NN, KK, &alpha, d_A, M, d_B, L, &beta, d_C, MM);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! kernel execution error.\n");
    }

    /* Read the result back */
    status = cublasGetVector(MM * NN, sizeof(h_C[0]), d_C, 1, h_C_out, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! device access error (read C)\n");

    }

    if (cudaFree(d_A) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (A)\n");
    }
    if (cudaFree(d_B) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (B)\n");
    }
    if (cudaFree(d_C) != cudaSuccess) {
        mexErrMsgTxt("!!!! memory free error (C)\n");
    }

    /* Shutdown */
    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        mexErrMsgTxt("!!!! shutdown error (A)\n");
    }
}
/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;
    cublasHandle_t handle;

    int dev = findCudaDevice(argc, (const char **) argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Performs operation using cublas */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;

    for (i = 0; i < n2; ++i)
    {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }

    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);

    if (fabs(ref_norm) < 1e-7)
    {
        fprintf(stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    printf("CUBLAS program finished\n");

    exit(error_norm / ref_norm < 1e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
}
Exemple #20
0
SEXP magma_dgeMatrix_LU_(SEXP x, Rboolean warn_sing)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = get_factors(x, "LU");
	int *dims, npiv, info;

	if (val != R_NilValue) {
//		R_ShowMessage("already in slot");	/* nothing to do if it's there in 'factors' slot */
		return val;
	}

	dims = INTEGER(GET_SLOT(x, Matrix_DimSym));
	if (dims[0] < 1 || dims[1] < 1)
		error(_("Cannot factor a matrix with zero extents"));
	npiv = (dims[0] < dims[1]) ? dims[0] : dims[1];
	val = PROTECT(NEW_OBJECT(MAKE_CLASS("denseLU")));
	slot_dup(val, x, Matrix_xSym);
	slot_dup(val, x, Matrix_DimSym);
	double *h_R = REAL(GET_SLOT(val, Matrix_xSym));
	int *ipiv = INTEGER(ALLOC_SLOT(val, Matrix_permSym, INTSXP, npiv));
	
	if(GPUFlag == 0){
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: LU decomposition using dgetrf;");
#endif
		F77_CALL(dgetrf)(dims, dims + 1, h_R,
				dims,
				ipiv,
				&info);
	}
	else if(GPUFlag == 1 && Interface == 0){
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: LU decomposition using magma_dgetrf;");
#endif
			magma_dgetrf(dims[0], dims[1], h_R, dims[0], ipiv, &info);
		}
		else if(GPUFlag == 1 && Interface == 1) {
			
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: LU decomposition using magma_dgetrf_gpu;");	
#endif
			double *d_A;
			int N2 = dims[0] * dims[1];
			cublasStatus retStatus;

			cublasAlloc( N2 , sizeof(double), (void**)&d_A);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector(N2, sizeof(double), h_R, 1, d_A, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer to Device"));
			/********************************************/

			magma_dgetrf_gpu(dims[0],dims[1], d_A, dims[0], ipiv,  &info);
			
			cublasGetVector( N2, sizeof(double), d_A, 1, h_R, 1);		
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer from Device"));
			/********************************************/
			
				cublasFree(d_A);
		
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error freeing data"));
			/********************************************/
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
		
	if (info < 0)
		error(_("Lapack routine %s returned error code %d"), "dgetrf", info);
	else if (info > 0 && warn_sing)
		warning(_("Exact singularity detected during LU decomposition: %s, i=%d."),
				"U[i,i]=0", info);
	UNPROTECT(1);

	return set_factors(x, val, "LU");
#endif

	    return R_NilValue;
}
Exemple #21
0
SEXP magma_dpoMatrix_solve(SEXP x)
{
#ifdef HIPLAR_WITH_MAGMA
    SEXP Chol = magma_dpoMatrix_chol(x);
    SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix")));
    int *dims = INTEGER(GET_SLOT(x, Matrix_DimSym)), info;

    SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
    slot_dup(val, Chol, Matrix_uploSym);
    slot_dup(val, Chol, Matrix_xSym);
    slot_dup(val, Chol, Matrix_DimSym);
    SET_SLOT(val, Matrix_DimNamesSym,
	     duplicate(GET_SLOT(x, Matrix_DimNamesSym)));
    double *A = REAL(GET_SLOT(val, Matrix_xSym));
		int N = *dims;	
		int lda = N;
		const char *uplo = uplo_P(val);
		
		if(GPUFlag == 0) {
			
			F77_CALL(dpotri)(uplo_P(val), dims, A, dims, &info);
		
		}
		else if(GPUFlag == 1 && Interface == 0) {
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Solving using magma_dpotri");
#endif
			magma_dpotri(uplo[0], N, A, lda, &info);
		}
		else if(GPUFlag == 1 && Interface == 1){
			double *d_A;
			cublasStatus retStatus;
			cublasAlloc( N * lda , sizeof(double), (void**)&d_A);
#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Solving using magma_dpotri_gpu");
#endif		
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( N * lda, sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			magma_dpotri_gpu(uplo[0], N, d_A, lda, &info);

			cublasGetVector(N * lda, sizeof(double), d_A, 1, val, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/
			
			cublasFree(d_A);
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
	
		if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}
		
		UNPROTECT(1);
    return val;
#endif
	return R_NilValue;
}
Exemple #22
0
SEXP magma_dgeMatrix_crossprod(SEXP x, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
	int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x) */
	SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dpoMatrix"))),
	     nms = VECTOR_ELT(GET_SLOT(x, Matrix_DimNamesSym), tr ? 0 : 1),
	     vDnms = ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
	int *Dims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
	    *vDims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
	int k = tr ? Dims[1] : Dims[0], n = tr ? Dims[0] : Dims[1];
	double *vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n)),
		 one = 1.0, zero = 0.0;
	double *A =  REAL(GET_SLOT(x, Matrix_xSym));
	AZERO(vx, n * n);
	SET_SLOT(val, Matrix_uploSym, mkString("U"));
	ALLOC_SLOT(val, Matrix_factorSym, VECSXP, 0);
	vDims[0] = vDims[1] = n;
	SET_VECTOR_ELT(vDnms, 0, duplicate(nms));
	SET_VECTOR_ELT(vDnms, 1, duplicate(nms));
	if(n && GPUFlag == 1) {

#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing crossproduct using cublasDsyrk");
#endif
		cublasStatus retStatus;
		double *d_A, *d_C;

		/*retStatus = cublasCreate(&handle);
		  if ( retStatus != CUBLAS_STATUS_SUCCESS )		
		  error(_("CUBLAS initialisation failed"));
		  */

		cublasAlloc(n * k, sizeof(double), (void**)&d_A);
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasAlloc(n * n, sizeof(double), (void**)&d_C);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasSetVector( n  * k , sizeof(double), A, 1, d_A, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/
		
		//cublasSetVector( n  * n , sizeof(double), vx, 1, d_C, 1);
		
		/* Error Checking */
		//retStatus = cublasGetError ();
		//if (retStatus != CUBLAS_STATUS_SUCCESS) 
		//	error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/


		cublasDsyrk('U' , tr ? 'N' : 'T', n, k, one, d_A, Dims[0], zero, d_C, n);

		cublasGetVector( n * n , sizeof(double), d_C, 1, vx, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer from Device"));
		/********************************************/

		cublasFree(d_A);
		cublasFree(d_C);

	} else if(n){
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Performing cross prod with dsyrk");
#endif
		F77_CALL(dsyrk)("U", tr ? "N" : "T", &n, &k, &one, A, Dims,
				&zero, vx, &n);
	}

	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	UNPROTECT(1);
	return val;
#endif
	return R_NilValue;
}
Exemple #23
0
SEXP magma_dgeMatrix_matrix_crossprod(SEXP x, SEXP y, SEXP trans)
{
#ifdef HIPLAR_WITH_MAGMA
	int tr = asLogical(trans);/* trans=TRUE: tcrossprod(x,y) */
	SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *xDims = INTEGER(GET_SLOT(x, Matrix_DimSym)),
	    *yDims = INTEGER(getAttrib(y, R_DimSymbol)),
	    *vDims, nprot = 1;
	int m  = xDims[!tr],  n = yDims[!tr];/* -> result dim */
	int xd = xDims[ tr], yd = yDims[ tr];/* the conformable dims */
	double one = 1.0, zero = 0.0;

	if (isInteger(y)) {
		y = PROTECT(coerceVector(y, REALSXP));
		nprot++;
	}
	if (!(isMatrix(y) && isReal(y)))
		error(_("Argument y must be a numeric matrix"));
	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	SET_SLOT(val, Matrix_DimSym, allocVector(INTSXP, 2));
	vDims = INTEGER(GET_SLOT(val, Matrix_DimSym));
	if (xd > 0 && yd > 0 && n > 0 && m > 0) {
		if (xd != yd)
			error(_("Dimensions of x and y are not compatible for %s"),
					tr ? "tcrossprod" : "crossprod");
		vDims[0] = m; vDims[1] = n;
		SET_SLOT(val, Matrix_xSym, allocVector(REALSXP, m * n));

		double *A = REAL(GET_SLOT(x, Matrix_xSym));
		double *B = REAL(y);
		double *C = REAL(GET_SLOT(val, Matrix_xSym));

		if(GPUFlag == 1) {
			
			double *d_A, *d_B, *d_C;
			cublasStatus retStatus;

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix crossprod using magmablas_dgemm");
#endif
			cublasAlloc(m * xd, sizeof(double), (void**)&d_A);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(n * xd, sizeof(double), (void**)&d_B);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( m  * xd , sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( xd * n, sizeof(double), B, 1, d_B, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( m * n, sizeof(double), C, 1, d_C, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/


			// ******** magmablas_dgemm call Here **
			//magmablas_dgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			//CHANGE
			cublasDgemm( tr ? 'N' : 'T', tr ? 'T' : 'N', m, n, xd, one, d_A, xDims[0], d_B, yDims[0], zero, d_C,  m);
			cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/

			cublasFree(d_A);
			cublasFree(d_B);
			cublasFree(d_C);
			
		}
		else {

#ifdef HIPLAR_DBG
			R_ShowMessage("DBG: Performing dge/matrix cross prod with dgemm");
#endif
			F77_CALL(dgemm)(tr ? "N" : "T", tr ? "T" : "N", &m, &n, &xd, &one,
					A , xDims,
					B , yDims,
					&zero, C, &m);
		}
	}
	UNPROTECT(nprot);
	return val;
#endif
	return R_NilValue;
}
/* Main */
int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *h_C_rnd;
    float *h_C_ref;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    cublasHandle_t handle;

    int dev_id;
    cudaDeviceProp device_prop;

    bool do_device_api_test = false;

    float host_api_test_ratio, device_api_test_ratio;

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running...\n");

    dev_id = findCudaDevice(argc, (const char **) argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if ((device_prop.major << 4) + device_prop.minor >= 0x35)
    {
        printf("Host and device APIs will be tested.\n");
        do_device_api_test = true;
    }
    /*    else if ((device_prop.major << 4) + device_prop.minor >= 0x20)
        {
            printf("Host API will be tested.\n");
            do_device_api_test = false;
        }
    */
    else
    {
        fprintf(stderr, "simpleDevLibCUBLAS examples requires Compute Capability of SM 3.5 or higher\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_SUCCESS;
    }

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    h_C_rnd = (float *)malloc(n2 * sizeof(h_C_rnd[0]));

    if (h_C_rnd == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C_rnd)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    h_C = (float *)malloc(n2 * sizeof(h_C_ref[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C_rnd[i] = rand() / (float)RAND_MAX;
        h_C[i] = h_C_rnd[i];
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /*
     * Performs operation using plain C code
     */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /*
     * Performs operation using cublas
     */
    status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Allocate host memory for reading back the result from device memory */
    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    host_api_test_ratio = check_result(h_C, h_C_ref, n2);

    if (do_device_api_test)
    {
        /* Reset device resident C matrix */
        status = cublasSetVector(n2, sizeof(h_C_rnd[0]), h_C_rnd, 1, d_C, 1);

        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!!! device access error (write C)\n");

            // cudaDeviceReset causes the driver to clean up all state. While
            // not mandatory in normal operation, it is good practice.  It is also
            // needed to ensure correct operation when the application is being
            // profiled. Calling cudaDeviceReset causes all profile data to be
            // flushed before the application exits
            cudaDeviceReset();
            return EXIT_FAILURE;
        }

        /*
         * Performs operation using the device API of CUBLAS library
         */
        device_cublas_sgemm(N, alpha, d_A, d_B, beta, d_C);

        /* Read the result back */
        status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

        if (status != CUBLAS_STATUS_SUCCESS)
        {
            fprintf(stderr, "!!!! device access error (read C)\n");

            // cudaDeviceReset causes the driver to clean up all state. While
            // not mandatory in normal operation, it is good practice.  It is also
            // needed to ensure correct operation when the application is being
            // profiled. Calling cudaDeviceReset causes all profile data to be
            // flushed before the application exits
            cudaDeviceReset();
            return EXIT_FAILURE;
        }

        /* Check result against reference */
        device_api_test_ratio = check_result(h_C, h_C_ref, n2);
    }

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_rnd);
    free(h_C_ref);

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        return EXIT_FAILURE;
    }

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    bool test_result = do_device_api_test ?
                       host_api_test_ratio < 1e-6 &&
                       device_api_test_ratio < 1e-6 :
                       host_api_test_ratio < 1e-6;

    printf("simpleCUBLAS completed, returned %s\n",
           test_result ? "OK" : "ERROR!");

    exit(test_result ? EXIT_SUCCESS : EXIT_FAILURE);
}
Exemple #25
0
SEXP magma_dpoMatrix_dgeMatrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP Chol = magma_dpoMatrix_chol(a),
			 val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
			*bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
			info;

	/* Checking Matrix Dimensions */
	if (adims[1] != bdims[0])
		error(_("Dimensions of system to be solved are inconsistent"));
	if (adims[0] < 1 || bdims[1] < 1)
		error(_("Cannot solve() for matrices with zero extents"));
	/* ****************************************** */
	
	SET_SLOT(val, Matrix_factorSym, allocVector(VECSXP, 0));
	slot_dup(val, b, Matrix_DimSym);
	slot_dup(val, b, Matrix_xSym);

	double *A = REAL(GET_SLOT(Chol, Matrix_xSym));
	double *B = REAL(GET_SLOT(val, Matrix_xSym));

	if(GPUFlag == 1) {
	
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;");
#endif
		double *d_A, *d_B;
		const char *uplo = uplo_P(Chol);
		magma_int_t NRHS = bdims[1];
		magma_int_t lda	 = adims[1];
		magma_int_t ldb  = bdims[0];
		magma_int_t N 	 = adims[0];
		cublasStatus retStatus;

		/*if(uplo == "U")
			uplo = MagmaUpperStr;
		else if(uplo == "L")
			uplo = MagmaLowerStr;
		else		
			uplo = MagmaUpperStr;
		*/

		cublasAlloc(N * lda, sizeof(double), (void**)&d_A);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B);	

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation"));
		/********************************************/

		cublasSetVector( N  * lda , sizeof(double), A, 1, d_A, 1);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/

		cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 );
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer to Device"));
		/********************************************/

		magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info);

		cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1);
		
		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Data Transfer from Device"));
		/********************************************/
		
		cublasFree(d_A);
		cublasFree(d_B);
	}
	else {
	
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs;");
#endif
		F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1, A , adims, B , bdims, &info);
	}
	if (info) {
		if(info > 0)
			error(_("the leading minor of order %d is not positive definite"),
					info);
		else /* should never happen! */
			error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
	}
	UNPROTECT(1);
	return val;
#endif
	return R_NilValue;
}
Exemple #26
0
SEXP magma_dgeMatrix_matrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = PROTECT(dup_mMatrix_as_dgeMatrix(b)),
			 lu = PROTECT(magma_dgeMatrix_LU_(a, TRUE));
	int *adims = INTEGER(GET_SLOT(lu, Matrix_DimSym)),
			*bdims = INTEGER(GET_SLOT(val, Matrix_DimSym));
	int info, n = bdims[0], nrhs = bdims[1];



	if (*adims != *bdims || bdims[1] < 1 || *adims < 1 || *adims != adims[1])
		error(_("Dimensions of system to be solved are inconsistent"));

	double *A = REAL(GET_SLOT(lu, Matrix_xSym));
	double *B  = REAL(GET_SLOT(val, Matrix_xSym));
	int *ipiv = INTEGER(GET_SLOT(lu, Matrix_permSym));

	if(GPUFlag == 0) {
		F77_CALL(dgetrs)("N", &n, &nrhs, A, &n, ipiv, B, &n, &info);	
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Solve using LU using dgetrs;");
#endif
	}else if(GPUFlag == 1) {
		
	
#ifdef HIPLAR_DBG
		R_ShowMessage("DBG: Solve using LU using magma_dgetrs;");
#endif		
		double *d_A, *d_B;
		cublasStatus retStatus;

		cublasAlloc(adims[0] * adims[1], sizeof(double), (void**)&d_A);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation of A on Device"));
		/********************************************/


		cublasAlloc(n * nrhs, sizeof(double), (void**)&d_B);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Memory Allocation of b on Device"));
		/********************************************/



		cublasSetVector(adims[0] * adims[1], sizeof(double), A, 1, d_A, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Transferring data to advice"));
		/********************************************/

		cublasSetVector(n * nrhs, sizeof(double), B, 1, d_B, 1);

		magma_dgetrs_gpu( 'N', n, nrhs, d_A, n, ipiv, d_B, n, &info );

		cublasGetVector(n * nrhs, sizeof(double), d_B, 1, B, 1);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in Transferring from to advice"));
		/********************************************/

		cublasFree(d_A);
		cublasFree(d_B);

		/* Error Checking */
		retStatus = cublasGetError ();
		if (retStatus != CUBLAS_STATUS_SUCCESS) 
			error(_("CUBLAS: Error in freeing data"));
		/********************************************/

		
	}
	if (info)
		error(_("Lapack routine dgetrs: system is exactly singular"));
	UNPROTECT(2);
	return val;
#endif
	    return R_NilValue;
}
Exemple #27
0
SEXP magma_dpoMatrix_matrix_solve(SEXP a, SEXP b)
{
#ifdef HIPLAR_WITH_MAGMA
    SEXP Chol = magma_dpoMatrix_chol(a),
	val = PROTECT(duplicate(b));
    int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
	*bdims = INTEGER(getAttrib(b, R_DimSymbol)),
	info;

    if (!(isReal(b) && isMatrix(b)))
	error(_("Argument b must be a numeric matrix"));
    if (*adims != *bdims || bdims[1] < 1 || *adims < 1)
	error(_("Dimensions of system to be solved are inconsistent"));
    
		double *A = REAL(GET_SLOT(Chol, Matrix_xSym));
		double *B = REAL(val);
		//const char *uplo = uplo_P(Chol);
		//int N = bdims[1];
		//There is only a GPU interface for this call
		//so it will be the default setting if the GPU is on
		if(GPUFlag == 1) {
	
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Solving system of Ax = b, A = dpo, b = dge, using dpotrs_gpu;");
#endif
			double *d_A, *d_B;
			const char *uplo = uplo_P(Chol);
			magma_int_t NRHS = bdims[1];
			magma_int_t lda	 = adims[1];
			magma_int_t ldb  = bdims[0];
			magma_int_t N 	 = adims[0];
			cublasStatus retStatus;
			cublasAlloc(N * lda, sizeof(double), (void**)&d_A);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasAlloc(N * NRHS, sizeof(double), (void**)&d_B);	

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector( N  * lda , sizeof(double), A, 1, d_A, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			cublasSetVector( ldb * NRHS, sizeof(double), B, 1, d_B, 1 );

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer to Device"));
			/********************************************/

			magma_dpotrs_gpu(uplo[0], N, NRHS , d_A, lda, d_B, ldb, &info);

			cublasGetVector( ldb * NRHS, sizeof(double), d_B, 1, B, 1);

			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Data Transfer from Device"));
			/********************************************/

			cublasFree(d_A);
			cublasFree(d_B);
		}
		else {
		F77_CALL(dpotrs)(uplo_P(Chol), adims, bdims + 1,
		     REAL(GET_SLOT(Chol, Matrix_xSym)), adims,
		     REAL(val), bdims, &info);
		}
		// Error checking of MAGMA/LAPACK calls
		if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}

		UNPROTECT(1);
    return val;
#endif
	return R_NilValue;
}
Exemple #28
0
SEXP magma_dpoMatrix_chol(SEXP x)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP val = get_factors(x, "Cholesky"),
			 dimP = GET_SLOT(x, Matrix_DimSym),
			 uploP = GET_SLOT(x, Matrix_uploSym);

	const char *uplo = CHAR(STRING_ELT(uploP, 0));
	int *dims = INTEGER(dimP), info;
	int n = dims[0];
	double *vx;
	cublasStatus retStatus;
	if (val != R_NilValue) return val;
	dims = INTEGER(dimP);
	val = PROTECT(NEW_OBJECT(MAKE_CLASS("Cholesky")));
	SET_SLOT(val, Matrix_uploSym, duplicate(uploP));
	SET_SLOT(val, Matrix_diagSym, mkString("N"));
	SET_SLOT(val, Matrix_DimSym, duplicate(dimP));
	vx = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, n * n));
	AZERO(vx, n * n);
	
	//we could put in magmablas_dlacpy but it only
	//copies all of the matrix 
	F77_CALL(dlacpy)(uplo, &n, &n, REAL(GET_SLOT(x, Matrix_xSym)), &n, vx, &n);
	if (n > 0) {

		if(GPUFlag == 0){
#ifdef HIPLAR_DBG	
		R_ShowMessage("DBG: Cholesky decomposition using dpotrf;");
#endif
			F77_CALL(dpotrf)(uplo, &n, vx, &n, &info);
		}
		else if(GPUFlag == 1 && Interface == 0){
		
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf;");
#endif			
			int nrows, ncols;
			nrows = ncols = n;

			magma_int_t lda;
			lda = nrows;

			magma_dpotrf(uplo[0], ncols, vx, lda, &info);

			/* Error Checking */
			retStatus = cudaGetLastError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf"));
			/********************************************/
			

		}
		else if(GPUFlag == 1 && Interface == 1) {
	
#ifdef HIPLAR_DBG	
			R_ShowMessage("DBG: Cholesky decomposition using magma_dpotrf_gpu;");	
#endif
			double *d_c;
			int nrows, ncols;
			nrows = ncols = n;
			int N2 = nrows * ncols;


			magma_int_t lda;
			lda = nrows;

			cublasAlloc(lda * ncols, sizeof(double), (void**)&d_c);
			
			/* Error Checking */
			retStatus = cublasGetError ();
			if (retStatus != CUBLAS_STATUS_SUCCESS) 
				error(_("CUBLAS: Error in Memory Allocation"));
			/********************************************/

			cublasSetVector(N2, sizeof(double), vx, 1, d_c, 1);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer to Device"));
			/********************************************/


			magma_dpotrf_gpu(uplo[0], ncols, d_c, lda, &info);
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in magma_dpotrf_gpu"));
			/********************************************/
			

			cublasGetVector(nrows * ncols, sizeof(double), d_c, 1, vx, 1);		
			
			/* Error Checking */
			retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Date Transfer from Device"));
			/********************************************/
			
			cublasFree(d_c);
		}
		else
			error(_("MAGMA/LAPACK/Interface Flag not defined correctly"));
		}
		
	if (info) {
			if(info > 0)
				error(_("the leading minor of order %d is not positive definite"),
						info);
			else /* should never happen! */
				error(_("Lapack routine %s returned error code %d"), "dpotrf", info);
		}

	UNPROTECT(1);
	return set_factors(x, val, "Cholesky");
#endif
	return R_NilValue;
}
Exemple #29
0
SEXP magma_dgeMatrix_solve(SEXP a)
{
#ifdef HIPLAR_WITH_MAGMA
    /*  compute the 1-norm of the matrix, which is needed
	later for the computation of the reciprocal condition number. */
    double aNorm = magma_get_norm(a, "1");

    /* the LU decomposition : */
		/* Given that we may be performing this operation
		 * on the GPU we may put in an optimisation here
		 * where if we call the LU solver we, we do not require
		 * the decomposition to be transferred back to CPU. This is TODO
		 */
    SEXP val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix"))),
	lu = magma_dgeMatrix_LU_(a, TRUE);
    int *dims = INTEGER(GET_SLOT(lu, Matrix_DimSym)),
	*pivot = INTEGER(GET_SLOT(lu, Matrix_permSym));

    /* prepare variables for the dgetri calls */
    double *x, tmp;
    int	info, lwork = -1;

    if (dims[0] != dims[1]) error(_("Solve requires a square matrix"));
    slot_dup(val, lu, Matrix_xSym);
    x = REAL(GET_SLOT(val, Matrix_xSym));
    slot_dup(val, lu, Matrix_DimSym);
		int N2 = dims[0] * dims[0];

    if(dims[0]) /* the dimension is not zero */
    {
			/* is the matrix is *computationally* singular ? */
			double rcond;
			F77_CALL(dgecon)("1", dims, x, dims, &aNorm, &rcond,
					(double *) R_alloc(4*dims[0], sizeof(double)),
					(int *) R_alloc(dims[0], sizeof(int)), &info);
			if (info)
				error(_("error [%d] from Lapack 'dgecon()'"), info);
			if(rcond < DOUBLE_EPS)
				error(_("Lapack dgecon(): system computationally singular, reciprocal condition number = %g"),
						rcond);

			/* only now try the inversion and check if the matrix is *exactly* singular: */
			// This is also a work space query. This is not an option in magma

			F77_CALL(dgetri)(dims, x, dims, pivot, &tmp, &lwork, &info);
			lwork = (int) tmp;
			
			if( GPUFlag == 0){
				

				F77_CALL(dgetri)(dims, x, dims, pivot,
						(double *) R_alloc((size_t) lwork, sizeof(double)),
						&lwork, &info);

#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Solve using LU using dgetri;");
#endif
			}
			else if(GPUFlag == 1) {
				
				double *d_x, *dwork; 
				cublasStatus retStatus;			
	
#ifdef HIPLAR_DBG
				R_ShowMessage("Solve using LU using magma_dgetri;");
#endif
				cublasAlloc(N2, sizeof(double), (void**)&d_x);

				//cublasAlloc(N2 , sizeof(double), (void**)&dtmp);
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation on Device"));
				/********************************************/

				cublasSetVector( N2, sizeof(double), x, 1, d_x, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/
				lwork = dims[0] * magma_get_dgetri_nb( dims[0] );
				
					cublasAlloc(lwork, sizeof(double), (void**)&dwork);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation on Device"));
				/********************************************/

				magma_dgetri_gpu(dims[0], d_x, dims[0], pivot, dwork , lwork, &info);

				cublasGetVector(N2, sizeof(double), d_x, 1, x, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data From to Device"));
				/********************************************/

				cublasFree(dwork);
				cublasFree(d_x);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error freeing memory"));
				/********************************************/
					
			}
			else
				error(_("GPUFlag not set correctly"));

			if (info)
				error(_("Lapack routine dgetri: system is exactly singular"));
    }
    UNPROTECT(1);
    return val;
#endif
	    return R_NilValue;
}
Exemple #30
0
SEXP magma_dgeMatrix_matrix_mm(SEXP a, SEXP bP, SEXP right)
{
#ifdef HIPLAR_WITH_MAGMA
	SEXP b = PROTECT(mMatrix_as_dgeMatrix(bP)),
	     val = PROTECT(NEW_OBJECT(MAKE_CLASS("dgeMatrix")));
	int *adims = INTEGER(GET_SLOT(a, Matrix_DimSym)),
	    *bdims = INTEGER(GET_SLOT(b, Matrix_DimSym)),
	    *cdims = INTEGER(ALLOC_SLOT(val, Matrix_DimSym, INTSXP, 2));
	double one = 1.0, zero = 0.0;

	if (asLogical(right)) {
		int m = bdims[0], n = adims[1], k = bdims[1];
		if (adims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		if (m < 1 || n < 1 || k < 1) {
			// 		This was commented out
				    error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			double *B = REAL(GET_SLOT(b, Matrix_xSym));
			double *A = REAL(GET_SLOT(a, Matrix_xSym));
			double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
			//TODO add magma here too
			if(GPUFlag == 1) {
				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;


#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication with Right = true using magmablas_dgemm");
#endif
				cublasAlloc(n * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( n  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( m * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C,  m);
				//CHANGED 30/07
				cublasDgemm('N', 'N', m, n, k, one, d_B, m, d_A, k, zero, d_C, m);
				
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in cublasDgemm routine"));
				}
				/********************************************/

				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm with right = TRUE");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						B, &m, A , &k, &zero, C , &m);
			}
		}
	} else {
		int m = adims[0], n = bdims[1], k = adims[1];
		double *A = REAL(GET_SLOT(a, Matrix_xSym));
		double *B = REAL(GET_SLOT(b, Matrix_xSym));


		if (bdims[0] != k)
			error(_("Matrices are not conformable for multiplication"));
		cdims[0] = m; cdims[1] = n;
		double *C = REAL(ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n));
		
		if (m < 1 || n < 1 || k < 1) {
			//		This was commented out
			error(_("Matrices with zero extents cannot be multiplied")); 
			ALLOC_SLOT(val, Matrix_xSym, REALSXP, m * n);
		} else {
			if(GPUFlag == 1) {

				double *d_A, *d_B, *d_C;
				cublasStatus retStatus;

	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using magmablas_dgemm");
#endif			
				cublasAlloc(m * k, sizeof(double), (void**)&d_A);
			
				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(n * k, sizeof(double), (void**)&d_B);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasAlloc(m * n, sizeof(double), (void**)&d_C);	

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Memory Allocation"));
				/********************************************/

				cublasSetVector( m  * k , sizeof(double), A, 1, d_A, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				cublasSetVector( n * k, sizeof(double), B, 1, d_B, 1 );

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer to Device"));
				/********************************************/

				// ******** magmablas_dgemm call Here **
				//magmablas_dgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C,  m);
				//CHANGE
				cublasDgemm('N', 'N', m, n, k, one, d_A, m, d_B, k, zero, d_C, m);
				
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) {
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/
				}
				
				cublasGetVector( m * n , sizeof(double), d_C, 1, C, 1);

				/* Error Checking */
				retStatus = cublasGetError ();
				if (retStatus != CUBLAS_STATUS_SUCCESS) 
					error(_("CUBLAS: Error in Data Transfer from Device"));
				/********************************************/

				cublasFree(d_A);
				cublasFree(d_B);
				cublasFree(d_C);
				
			}
			else {
	
#ifdef HIPLAR_DBG
				R_ShowMessage("DBG: Performing matrix multiplication using dgemm");
#endif
				F77_CALL(dgemm) ("N", "N", &m, &n, &k, &one,
						A, &m,
						B, &k, &zero,
						C,
						&m);	

			}
		}
	}
	ALLOC_SLOT(val, Matrix_DimNamesSym, VECSXP, 2);
	UNPROTECT(2);
	return val;
#endif
	return R_NilValue;
}