Ejemplo n.º 1
0
void gpu_cublas1(double *A, double *B, double *C, double *D, double *r, double *nrmC, int N, int N2)
{
	#pragma acc data present(A, B, C, D)
	{
		#pragma acc host_data use_device(A, B, C, D)
		{
			cublasHandle_t handle;
			cublasCreate(&handle);
			const double alpha = 1.0;
			const double beta = 0.0;
			cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, N, N, N, &alpha, A, N, B, N, &beta, C, N);
			printf(" gpu gemm success \n");
			cublasDdot(handle, N2, C, 1, B, 1, r);
			printf(" gpu dot success \n");
			*r = -1.0 * *r;
			cublasDaxpy(handle, N2, r, B, 1, C, 1);
			printf(" gpu axpy success \n");
			cublasDnrm2(handle, N2, C, 1, nrmC);
			printf(" gpu nrm2 success \n");
			cublasDcopy(handle, N2, C, 1, D, 1);
			printf(" gpu copy success \n");
			*nrmC = 1.0 / *nrmC;
			cublasDscal(handle, N2, nrmC, D, 1);
			printf(" gpu scal success \n");
			cublasDestroy(handle);
			printf(" gpu destroy success \n");
		}
	}
}
Ejemplo n.º 2
0
void magma_dcopy(
    magma_int_t n,
    const double *dx, magma_int_t incx,
    double       *dy, magma_int_t incy )
{
    cublasDcopy( n, dx, incx, dy, incy );
}
Ejemplo n.º 3
0
//copy on CPU
//copy on GPU
GPUMat& GPUMat::copyFromCPU(const GPUMat& rhs) {
    // Check for self-assignment!
    if (this != &rhs) {
        delete _data_CPU;
        cudaFree(_data_GPU);
        cudaStat = cudaMalloc((void **)&_data_GPU ,n_elem*sizeof(double));
        cudaStat = cublasDcopy(handle, n_elem, rhs.memptr_GPU(),1, _data_GPU,1);
        // Deallocate, allocate new space, copy values...
    }
    // 1.  Deallocate any memory that MyClass is using internally
    // 2.  Allocate some memory to hold the contents of rhs
    // 3.  Copy the values from rhs into this instance
    // 4.  Return *this
    return *this;
}
Ejemplo n.º 4
0
// Solve A * x = b in GPU.
void cublas_backsolver(double *A, double *x, double *b, int N)
{
	#pragma acc data present(A, x, b)
	{
		#pragma host_data use_device(A, x, b)
		{
			cublasHandle_t h;
			cublasCreate(&h);
			cublasDcopy(h, N, b, 1, x, 1);
//			printf(" cublasDcopy success. \n");
			cublasDtrsv(h, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, N, A, N, x, 1);
//			printf(" cublasDtrsv success. \n");
			cublasDestroy(h);
		}
	}
}
Ejemplo n.º 5
0
Archivo: ardblas.c Proyecto: rforge/gcb
void d_copy(SEXP rx, SEXP rincx, SEXP ry, SEXP rincy)
{
	int
		nx, ny, n,
		incx = asInteger(rincx),
		incy = asInteger(rincy);
	double
		* x, * y;

	unpackVector(rx, &nx, &x);
	unpackVector(ry, &ny, &y);
	n = imin2(nx, ny);

	cublasDcopy(n, x, incx, y, incy);
	checkCublasError("d_copy");
}
Ejemplo n.º 6
0
void
cube_blas_d_copy (cube_t       *ctx,
		  int           n,
		  const double *x, int incx,
		  double       *y, int incy)
{
  cublasStatus_t status;
  
  if (! cube_context_check (ctx))
    return;

  status = cublasDcopy (ctx->h_blas,
			n,
			x, incx,
			y, incy);

  cube_blas_check (ctx, status);
}
Ejemplo n.º 7
0
void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
                             double* y) {
  CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
  CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
}
Ejemplo n.º 8
0
int main(int argc, char* argv[])
{
	const int bufsize = 512;
    	char buffer[bufsize];
	int m,n,S;
	double time_st,time_end,time_avg;
	//omp_set_num_threads(2);
//	printf("\n-----------------\nnumber of threads fired = %d\n-----------------\n",(int)omp_get_num_threads());
	if(argc!=2)
	{
		cout<<"Insufficient arguments"<<endl;
		return 1;
	}
	
	graph G;

	cerr<<"Start reading                    ";
//	time_st=dsecnd();
	G.create_graph(argv[1]);
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cout<<"Success              "<<endl;
//	cerr<<"Reading time                     "<<time_avg<<endl;

	cerr<<"Constructing Matrices            ";
//	time_st=dsecnd();
	G.construct_MNA();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;

//	G.construct_sparse_MNA();
	m=G.node_array.size()-1;
	n=G.voltage_edge_id.size();
	
	cout<<endl;
	cout<<"MATRIX STAT:"<<endl;
	cout<<"Nonzero elements:               "<<G.nonzero<<endl;
	cout<<"Number of Rows:                 "<<m+n<<endl;
	printf("\n Nonzero = %ld", G.nonzero);
	printf("\n Rows = %d", m+n);

	cout<<"MAT val:		       "<<endl;
	int i,j;

//	G.Mat_val[0] +=100;
/*	
	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.Mat_val[i];
	cout<<endl;
	for(i=0;i<G.nonzero;i++)
		cout<<" "<<G.columns[i];
	cout<<endl;	
	for(i=0;i<m+n+1;i++)
		cout<<" "<<G.rowIndex[i];
	cout<<endl;
	
	
	for(i=0;i<m+n;i++)
	{
		cout<<endl;
		int startindex=G.rowIndex[i];
		int endindex=G.rowIndex[i+1];
		for(j=startindex;j<endindex;j++)
			cout<<" "<<G.Mat_val[j];
		cout<<endl;
		
	}
*/
/*	for (i=0;i<m+n+1;i++)
	{
		//cout<<endl;
		if(G.rowIndex[i]==G.rowIndex[i+1])
			break;
		
		for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			if(G.Mat_val[j]>10)
				cout<<G.Mat_val[j]<<"\t";
		}
		//cout<<endl;
		/*for(j=G.rowIndex[i];j<G.rowIndex[i+1];j++)
		{
			cout<<G.columns[j]<<"\t";
		}
		//cout<<endl;
	}
	cout<<endl;
*/

//printing the matrix
	printf("\n Fine till here");
	printf("\n");
//	int* rowmIndex=(int*)calloc(m+1,sizeof(int));
	printf("\n Fine till here");
	printf("\n");
	//int rowmIndex[5]={1,2,3,4,5};
/*	for(i=0;i<m+1;i++)
	{
		rowmIndex[i]=G.rowIndex[i];
		printf(" %d", rowmIndex[i]);
	}
*/
	cerr<<"Solving Equations    "<<endl;


    double r1, b, alpha, alpham1, beta, r0, a, na;
    
    
    const double tol = 0.1;
    const int max_iter = 1000000;
    int *d_col, *d_row;
    double *d_val, *d_x, dot;
    double *d_r, *d_p, *d_Ax;
    int k;
    
    cublasHandle_t cublasHandle = 0;
    cublasStatus_t cublasStatus;
    cublasStatus = cublasCreate(&cublasHandle);

    checkCudaErrors(cublasStatus);

    /* Get handle to the CUSPARSE context */
    cusparseHandle_t cusparseHandle = 0;
    cusparseStatus_t cusparseStatus;
    cusparseStatus = cusparseCreate(&cusparseHandle);

    checkCudaErrors(cusparseStatus);

    cusparseMatDescr_t descr = 0;
    cusparseStatus = cusparseCreateMatDescr(&descr);

    checkCudaErrors(cusparseStatus);

    cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);

    checkCudaErrors(cudaMalloc((void **)&d_col, G.nonzero*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_row, (m+n+1)*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&d_val, G.nonzero*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_x, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_r, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_p, (m+n)*sizeof(double)));
    checkCudaErrors(cudaMalloc((void **)&d_Ax, (m+n)*sizeof(double)));

    cudaMemcpy(d_col, G.columns, G.nonzero*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_row, G.rowIndex, (m+n+1)*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_val, G.Mat_val, G.nonzero*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, G.x, (m+n)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_r, G.b, (m+n)*sizeof(double), cudaMemcpyHostToDevice);

    alpha = 1.0;
    alpham1 = -1.0;
    beta = 0.0;
    r0 = 0.;

    printf("\n Data transferred\n");

    	cudaEvent_t start,stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	
	cudaEventRecord(start, 0);

    cusparseDcsrmv(cusparseHandle,CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax);

    cublasDaxpy(cublasHandle, (m+n), &alpham1, d_Ax, 1, d_r, 1);
    cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);

    k = 1;

    while (r1 > tol && k <= max_iter)
    {
        if (k > 1)
        {
            b = r1 / r0;
            cublasStatus = cublasDscal(cublasHandle, (m+n), &b, d_p, 1);
            cublasStatus = cublasDaxpy(cublasHandle, (m+n), &alpha, d_r, 1, d_p, 1);
        }
        else
        {
            cublasStatus = cublasDcopy(cublasHandle, (m+n), d_r, 1, d_p, 1);
        }

        cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, (m+n), (m+n), G.nonzero, &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax);
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_p, 1, d_Ax, 1, &dot);
        a = r1 / dot;

        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &a, d_p, 1, d_x, 1);
        na = -a;
        cublasStatus = cublasDaxpy(cublasHandle, (m+n), &na, d_Ax, 1, d_r, 1);

        r0 = r1;
        cublasStatus = cublasDdot(cublasHandle, (m+n), d_r, 1, d_r, 1, &r1);
//        cudaThreadSynchronize();
//        printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
        k++;
    }
    
    	cudaEventRecord(stop, 0);
	
	cudaEventSynchronize(stop);
	float elapsedTime;
	cudaEventElapsedTime(&elapsedTime, start, stop);
	printf("Iterations = %3d\tTime : %.6f milli-seconds : \n", k, elapsedTime);

    cudaMemcpy(G.x, d_x, (m+n)*sizeof(double), cudaMemcpyDeviceToHost);

/*        printf("\n x = \n");
	for(i=0;i<(m+n);i++)
	{
		printf("\n x[%d] = %.8f", i, G.x[i]);
	}	
*/    float rsum, diff, err = 0.0;

    for (int i = 0; i < (m+n); i++)
    {
        rsum = 0.0;

        for (int j = G.rowIndex[i]; j < G.rowIndex[i+1]; j++)
        {
            rsum += G.Mat_val[j]*G.x[G.columns[j]];
        }

        diff = fabs(rsum - G.b[i]);

        if (diff > err)
        {
            err = diff;
        }
    }

    cusparseDestroy(cusparseHandle);
    cublasDestroy(cublasHandle);

/*    free(I);
    free(J);
    free(val);
    free(x);
    free(rhs);
*/    cudaFree(d_col);
    cudaFree(d_row);
    cudaFree(d_val);
    cudaFree(d_x);
    cudaFree(d_r);
    cudaFree(d_p);
    cudaFree(d_Ax);

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    
/*    printf("\n X is:\n");
    for(i=0;i<(m+n);i++)
    {
    	printf("\n X[%d] = %.4f", i, G.x[i]);
    }
*/    	
    printf("Test Summary:  Error amount = %f\n", err);
    exit((k <= max_iter) ? 0 : 1);




/*	
	culaSparseHandle handle;
    	if (culaSparseCreate(&handle) != culaSparseNoError)
    	{
        	// this should only fail under extreme conditions
        	std::cout << "fatal error: failed to create library handle!" << std::endl;
        	exit(EXIT_FAILURE);
    	}
	
	StatusChecker sc(handle);

	culaSparsePlan plan;
	sc = culaSparseCreatePlan(handle, &plan);
	
	culaSparseCsrOptions formatOpts;
	culaSparseCsrOptionsInit(handle, &formatOpts);
	//formatOpts.indexing=1;	
	sc = culaSparseSetDcsrData(handle, plan, &formatOpts, m+n, G.nonzero, &G.Mat_val[0], &G.rowIndex[0], &G.columns[0], &G.x[0], &G.b[0]);
	printf("\n Fine till here");
	printf("\n");	
	culaSparseConfig config;
    	sc = culaSparseConfigInit(handle, &config);
    	config.relativeTolerance = 1e-2;
    	config.maxIterations = 100000;
	config.divergenceTolerance = 20;
	culaSparseResult result;
    	
/*    	sc = culaSparseSetHostPlatform(handle, plan, 0);

    // set cg solver
    	sc = culaSparseSetCgSolver(handle, plan, 0);
	printf("\n Fine till here");
	printf("\n");    
    // perform solve (cg + no preconditioner on host)
	culaSparseResult result;
	sc = culaSparseExecutePlan(handle, plan, &config, &result);
	sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
	std::cout << buffer << std::endl;
    	
    	if (culaSparsePreinitializeCuda(handle) == culaSparseNoError)
    {
        // change to cuda accelerated platform
        sc = culaSparseSetCudaPlatform(handle, plan, 0);

        // perform solve (cg + ilu0 on cuda)
        culaSparseGmresOptions solverOpts;
	culaSparseStatus status = culaSparseGmresOptionsInit(handle, &solverOpts);
	solverOpts.restart = 20;


	sc = culaSparseSetCgSolver(handle, plan, 0);

//        sc = culaSparseSetJacobiPreconditioner(handle, plan, 0);
//        sc = culaSparseSetIlu0Preconditioner(handle, plan, 0);
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;

        // change preconditioner to fainv 
        // this avoids data transfer by using data cached by the plan
//        sc = culaSparseSetIlu0Preconditioner(handle, plan, 0);

        // perform solve (cg + fainv on cuda)
        // these timing results should indicate minimal overhead
/*        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;

        // change solver
        // this avoids preconditioner regeneration by using data cached by the plan
        sc = culaSparseSetBicgstabSolver(handle, plan, 0);

        // perform solve (bicgstab + fainv on cuda)
        // the timing results should indicate minimal overhead and preconditioner generation time
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;
        
        sc = culaSparseSetGmresSolver(handle, plan, 0);

        // perform solve (bicgstab + fainv on cuda)
        // the timing results should indicate minimal overhead and preconditioner generation time
        sc = culaSparseExecutePlan(handle, plan, &config, &result);
        sc = culaSparseGetResultString(handle, &result, buffer, bufsize);
        std::cout << buffer << std::endl;
    }
    else
    {
        std::cout << "alert: no cuda capable gpu found" << std::endl;
    }

    // cleanup plan
    culaSparseDestroyPlan(plan);

    // cleanup handle
    culaSparseDestroy(handle);

    	FILE* myWriteFile;
  	myWriteFile=fopen("result.txt","w");
  	for (i = 0; i < n; i++)
    	{
		fprintf(myWriteFile,"%1f\n",G.x[i]);
    		//  printf ("\n x [%d] = % f", i, x[i]);
    	}
  	fprintf(myWriteFile,".end\n");
  	fclose(myWriteFile);
 
  	printf ("\n");
    	
    	
//	time_st=dsecnd();
//	solver(G.rowIndex,G.columns,G.Mat_val,G.b,G.x,m+n,G.nonzero);
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	printf("Successfully Solved in : %.6f secs\n",time_avg);
	cerr<<endl;

	cerr<<"Fillup Graph                    ";	
//	time_st=dsecnd();
	G.fillup_graph();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;

	//G.output_graph_stdout();
	cerr<<"Matching KCL                    ";
//	time_st=dsecnd();
	G.check_kcl();
//	time_end=dsecnd();
//	time_avg = (time_end-time_st);
//	cerr<<"Done                 "<<time_avg<<endl;
	/*for (int i=0;i<m+n;i++)
	  {
	  cout<<"M"<<i<<endl;
	  for (int j=0;j<m+n;j++)
	  cout<<" "<<j<<"#"<<M[i][j]<<endl;
	  }*/
} 
Ejemplo n.º 9
0
		cublasStatus_t cublasXcopy(int n, const double* x, int incx, double* y, int incy) {
			return cublasDcopy(g_context->cublasHandle, n, x, incx, y, incy);
		}
Ejemplo n.º 10
0
void caffe_gpu_copy<double>(const int N, const double* X, double* Y) {
  CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), N, X, 1, Y, 1));
}
void copy(const Vector<double> &x, Vector<double> &y)
{
  assert(x.getSize() == y.getSize()); 
  cublasDcopy(x.getSize(), x, x.inc(), y, y.inc());
}
Ejemplo n.º 12
0
//
// Overloaded function for dispatching to
// * CUBLAS backend, and
// * double value-type.
//
inline void copy( const int n, const double* x, const int incx, double* y,
        const int incy ) {
    cublasDcopy( n, x, incx, y, incy );
}
Ejemplo n.º 13
0
int CORE_dtstrf_cublas(int M, int N, int IB, int NB,
                double *U, int LDU,
                double *A, int LDA,
                double *L, int LDL,
                int *IPIV,
                double *WORK, int LDWORK,
                int *INFO)
{
  static double zzero = 0.0;
  static double mzone =-1.0;
  cublasStatus_t status;
  cudaError_t err;
  
  double alpha;
  int i, j, ii, sb;
  int im, ip;
  
#if CONFIG_VERBOSE
  fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n",
          __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK);
  fflush(stdout);
#endif
  
  /* Check input arguments */
  *INFO = 0;
  if (M < 0) {
    coreblas_error(1, "Illegal value of M");
    return -1;
  }
  if (N < 0) {
    coreblas_error(2, "Illegal value of N");
    return -2;
  }
  if (IB < 0) {
    coreblas_error(3, "Illegal value of IB");
    return -3;
  }
  if ((LDU < max(1,NB)) && (NB > 0)) {
    coreblas_error(6, "Illegal value of LDU");
    return -6;
  }
  if ((LDA < max(1,M)) && (M > 0)) {
    coreblas_error(8, "Illegal value of LDA");
    return -8;
  }
  if ((LDL < max(1,IB)) && (IB > 0)) {
    coreblas_error(10, "Illegal value of LDL");
    return -10;
  }
  
  /* Quick return */
  if ((M == 0) || (N == 0) || (IB == 0))
    return PLASMA_SUCCESS;
  
  /* Set L to 0 */
  err = cudaMemset(L, 0, LDL*N*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  double* dev_ptr = 0;
  err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  double* host_ptr;
  err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV);
  
  ip = 0;
  for (ii = 0; ii < N; ii += IB) {
    sb = min(N-ii, IB);
    
    for (i = 0; i < sb; i++) {
      status = cublasIdamax(kaapi_cuda_cublas_handle(),
                            M, &A[LDA*(ii+i)], 1, &im
                            );
      PLASMA_CUBLAS_ASSERT(status);
      
      /* get im */
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      /* ajust index, CUBLAS is 1-based indexing */
      im--;

      piv[ip] = ii+i+1;
      
      core_dtstrf_cmp(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if (host_ptr[0] == 1.0f) {
        /*
         * Swap behind.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
                   i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK
        );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Swap ahead.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
              sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA
         );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Set IPIV.
         */
        piv[ip] = NB + im + 1;

        core_dtstrf_set_zero(kaapi_cuda_kernel_stream(),
                             A, LDA, i, ii, im, zzero
                        );
      }
      
      core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if ((*INFO == 0) && (host_ptr[0] == 1.0f)) {
        *INFO = ii+i+1;
      }
      
//      alpha = ((double)1. / U[LDU*(ii+i)+ii+i]);
      alpha = host_ptr[1];
      status = cublasDscal(kaapi_cuda_cublas_handle(),
                           M, &alpha, &A[LDA*(ii+i)], 1
                           );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDcopy(kaapi_cuda_cublas_handle(),
                  M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1
        );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDger(kaapi_cuda_cublas_handle(),
                          M, sb-i-1,
                          &mzone, &A[LDA*(ii+i)], 1,
                          &U[LDU*(ii+i+1)+ii+i], LDU,
                          &A[LDA*(ii+i+1)], LDA
      );
      PLASMA_CUBLAS_ASSERT(status);
      ip = ip+1;
    }
    /*
     * Apply the subpanel to the rest of the panel.
     */
    if(ii+i < N) {
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] - ii;
        }
      }
      
      CORE_dssssm_cublas_v2(
                  NB, N-(ii+sb), M, N-(ii+sb), sb, sb,
                  &U[LDU*(ii+sb)+ii], LDU,
                  &A[LDA*(ii+sb)], LDA,
                  &L[LDL*ii], LDL,
                  WORK, LDWORK, &piv[ii]
                  );
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] + ii;
        }
      }
    }
  }
  
  cudaFreeHost(host_ptr);
  cudaFree(dev_ptr);
  return PLASMA_SUCCESS;
}