Exemplo n.º 1
0
extern "C" void
magma_finalize( magma_context *cntxt)
{
/*  -- MAGMA (version 1.6.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    This function finalizes the MAGMA hardware context.

    Arguments
    =========
    CNTXT  (input) MAGMA_CONTEXT
           Pointer to the MAGMA hardware context to be closed
    ===================================================================== */

  if (cntxt->num_cores > 1)
    /* Shut down the QUARK scheduler */
    QUARK_Delete(cntxt->quark);

  if (cntxt->num_gpus == 1)
    {
      /* Shutdown CUDA and CUBLAS*/
      cuCtxDetach( cntxt->gpu_context[0] );
      cublasShutdown();

      free(cntxt->gpu_context);
    }

  free(cntxt);
}
Exemplo n.º 2
0
CAMLprim value spoc_cublasShutdown(){
	cublasStatus cublas_error= CUBLAS_STATUS_SUCCESS;
	CUBLAS_CHECK_CALL(cublasShutdown());
	return Val_unit;
}
Exemplo n.º 3
0
/* Main */
int test_cublas(void)
{    
    cublasStatus status;
    cudaError_t e;
    float* h_A;
    float* h_B;
    float* h_C;
    float* h_C_ref;
    float* d_A = 0;
    void *vp;
    float* d_B = 0;
    float* d_C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    int n2 = N * N;
    int i;
    float error_norm;
    float ref_norm;
    float diff;

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    /* Allocate host memory for the matrices */
    h_A = (float*)malloc(n2 * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    h_B = (float*)malloc(n2 * sizeof(h_B[0]));
    if (h_B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Fill the matrices with test data */
    for (i = 0; i < n2; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = rand() / (float)RAND_MAX;
    }

    /* Allocate device memory for the matrices */
    if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    d_A = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    d_B = (float *) vp;

    if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
    d_C = (float *) vp;

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }
    
    /* Performs operation using plain C code */
    simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;

    /* Clear last error */
    cublasGetError();

    /* Performs operation using cublas */
    cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);
    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    
    /* Allocate host memory for reading back the result from device memory */
    h_C = (float*)malloc(n2 * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Read the result back */
    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (i = 0; i < n2; ++i) {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) {
        fprintf (stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }
    printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_ref);
    e = cudaFree(d_A);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_B);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    e = cudaFree(d_C);
    if (e != cudaSuccess) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
Exemplo n.º 4
0
SEXP magmaCholeskyFinal_m(SEXP A, SEXP n, SEXP NB, SEXP zeroTri, SEXP ngpu, SEXP lowerTri)
{
	magma_init();
	int ndevices;
	double *h_R;
	
	ndevices = INTEGER_VALUE(ngpu);
        int idevice;
        for(idevice=0; idevice < ndevices; idevice++)
        {
                magma_setdevice(idevice);
                if(CUBLAS_STATUS_SUCCESS != cublasInit())
                {
                        printf("Error: gpu %d: cublasInit failed\n", idevice);
                        magma_finalize();
                        exit(-1);
                }
        }
//	magma_print_devices();
	
	int In, INB;
	In = INTEGER_VALUE(n);
	INB = INTEGER_VALUE(NB);
	double *PA = NUMERIC_POINTER(A);
	int i,j;

	//magma_timestr_t start, end;
	double gpu_time;
	printf("Inside magma_dpotrf_m");
	/*for(i = 0; i < 5; i++)
	{
		for(j = 0; j < 5; j++)
		{
			printf("%.8f ", PA[i+j*In]);
		}
		printf("\n");
	}	*/
	magma_int_t N, status, info, nGPU, n2, lda;
	clock_t t1, t2;
	N = In;
	status = 0;
	int nGPUs = ndevices;

        lda = N;
        n2 = lda*N;

	if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) {      
        fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); 
        magma_finalize();                                                  
        exit(-1);   
     	}

	lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda );
	//printf("Modified by Vinay in 2 GPU\n");
	//INB = magma_get_dpotrf_nb(N);
//	INB = 224;
//	printf("INB = %d\n", INB);
	//ngpu = ndevices;
//	printf("ngpu = %d\n", ngpu);
	//max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB);
//	printf("max_size = %d\n", max_size);
	//int imax_size = max_size;
	//double *dA;
	//magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double));
	
	//ldda = (1+N/(INB*ndevices))*INB;
//	printf("ldda = %d\n", ldda);
	//magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB);
	//magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info);
	int lTri;
	lTri = INTEGER_VALUE(lowerTri);
	if(lTri){
		t1 = clock();
		magma_dpotrf_m(nGPUs, MagmaLower, N, h_R, N, &info);
		t2 = clock ();
	}
	else{
		t1 = clock();
		magma_dpotrf_m(nGPUs, MagmaUpper, N, h_R, N, &info);
		t2 = clock ();
	}
	gpu_time = (double) (t2-t1)/(CLOCKS_PER_SEC) ; // Magma time
	printf (" magma_dpotrf_m time : %f sec. \n", gpu_time );
	if(info != 0)
	{
		printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info));
	}
	
	//magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB);
	//for(dev = 0; dev < ndevices; dev++)
	//{
		//magma_setdevice(dev);
		//cudaFree(dA[dev]);
	//}
	lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda );
	magma_free_pinned(h_R);
	magma_finalize();
	cublasShutdown();

	int IZeroTri;
        IZeroTri = INTEGER_VALUE(zeroTri);
	if(IZeroTri & lTri) {
		for(i = 1; i < In; i++)
        	{
       			for(j=0; j< i; j++)
                	{
                       		PA[i*In+j] = 0.0;
                	}
        	}
	}
	else if(IZeroTri){
		for(i = 0; i < In; i++)
                {
                        for(j=i+1; j < In; j++)
                        {
                                PA[i*In+j] = 0.0;
                        }
                }
	}
	return(R_NilValue);
}
Exemplo n.º 5
0
int main(int argc, char **argv)
{
    #define test_A(i,j) test_A[(size_t)(j)*N+(i)]
    #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)]
    int N,NB,w,LDA,BB;
    size_t memsize; //bytes
    int iam, nprocs, mydevice;
    int ICTXT, nprow, npcol, myprow, mypcol;
    int i_one = 1, i_zero = 0, i_negone = -1;
    double d_one = 1.0, d_zero = 0.0, d_negone = -1.0;
    int IASEED = 100;
/*  printf("N=?\n");
    scanf("%ld",&N);
    printf("NB=?\n");
    scanf("%d", &NB);
    printf("width of Y panel=?\n");
    scanf("%ld",&w);
*/
    if(argc < 4){
        printf("invalid arguments N NB memsize(M)\n");
        exit(1);
    }
    N = atoi(argv[1]);
    NB = atoi(argv[2]);
    memsize = (size_t)atoi(argv[3])*1024*1024;
    BB = (N + NB - 1) / NB;
    w = memsize/sizeof(double)/BB/NB/NB - 1;
    assert(w > 0);
    LDA = N + 0; //padding

    int do_io = (N <= NSIZE);
    double llttime;
    double gflops;
    
    nprow = npcol = 1;
    blacs_pinfo_(&iam, &nprocs);
    blacs_get_(&i_negone, &i_zero, &ICTXT);
    blacs_gridinit_(&ICTXT, "R", &nprow, &npcol);
    blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol);
    #ifdef USE_MIC
        #ifdef __INTEL_OFFLOAD
            printf("offload compilation enabled\ninitialize each MIC\n");
            offload_init(&iam, &mydevice);
            #pragma offload target(mic:0)
            {
                mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE);
            }
        #else
            if(isroot)
                printf("offload compilation not enabled\n");
            exit(0);
        #endif
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){
                cuStatus = cublasCreate(&worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasInit();
        #endif
    #endif

    double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol
#ifdef VERIFY
    double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify
#endif
    
    /*Initialize A */
    int i,j;
    printf("Initialize A ... "); fflush(stdout);
    llttime = MPI_Wtime();
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A, &LDA, &i_zero, &i_zero,
         &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol); 
    llttime = MPI_Wtime() - llttime;
    printf("time %lf\n", llttime);
              
    /*print test_A*/
    if(do_io){
        printf("Original A=\n\n");
        matprint(test_A, N, LDA, 'A');
    }

    /*Use directed unblocked Cholesky factorization*/    
    /*
    t1 = clock();
    Test_dpotrf(test_A2,N);
    t2 = clock();
    printf ("time for unblocked Cholesky factorization on host %f \n",
        ((float) (t2 - t1)) / CLOCKS_PER_SEC);
    */
    
    /*print test_A*/
    /*
    if(do_io){
        printf("Unblocked result:\n\n");
        matprint(test_A2,N,'L');   
    }
    */ 

    /*Use tile algorithm*/
    Quark *quark = QUARK_New(OOC_NTHREADS);
    QUARK_DOT_DAG_Enable(quark, 0);
    #ifdef USE_MIC
//      mklmem(NB);
        printf("QUARK MIC affinity binding\n");
        QUARK_bind(quark);
        printf("offload warm up\n");
        warmup(quark);
    #endif
    QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0));
    printf("LLT start %lf\n", MPI_Wtime());
    llttime = Cholesky(quark,test_A,N,NB,LDA,memsize);
    printf("LLT end %lf\n", MPI_Wtime());
    QUARK_Delete(quark);
    #ifdef USE_MIC
        offload_destroy();
    #else
        #ifdef USE_CUBLASV2
        {
            cublasStatus_t cuStatus;
            for(int r = 0; r < OOC_NTHREADS; r++){ 
                cuStatus = cublasDestroy(worker_handle[r]);
                assert(cuStatus == CUBLAS_STATUS_SUCCESS);
            }
        }
        #else
            cublasShutdown();
        #endif
    #endif

    gflops = (double) N;
    gflops = gflops/3.0 + 0.5;
    gflops = gflops*(double)(N)*(double)(N);
    gflops = gflops/llttime/1024.0/1024.0/1024.0;
    printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n",
        N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops);
    #ifdef USE_MIC
        #pragma offload target(mic:0)
        {
            memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET);
        }
        printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0);
    #endif

    /*Update and print L*/             
    if(do_io){
        printf("L:\n\n");
        matprint(test_A,N,LDA,'L');
    }
#ifdef VERIFY
    printf("Verify... ");
    llttime = MPI_Wtime();
  /*
   * ------------------------
   * check difference betwen 
   * test_A and test_A2
   * ------------------------
   */
    /*
    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = j; i < N; i++)
          {
            double err = (test_A (i, j) - test_A2 (i, j));
            err = ABS (err);
            maxerr = MAX (err, maxerr);
            maxerr2 = maxerr2 + err * err;
          };
      };
    maxerr2 = sqrt (ABS (maxerr2));
    printf ("max difference between test_A and test_A2 %lf \n", maxerr);
    printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2);
    };
    */

  /*
   * ------------------
   * over-write test_A2
   * ------------------
   */
   
    pdmatgen(&ICTXT, "Symm", "Diag", &N,
         &N, &NB, &NB,
         test_A2, &LDA, &i_zero,
         &i_zero, &IASEED, &i_zero, &N, &i_zero, &N,
         &myprow, &mypcol, &nprow, &npcol);

  /*
   * ---------------------------------------
   * after solve, test_A2 should be identity
   * ---------------------------------------
   */
  // test_A = chol(B) = L;
  // test_A2 = B
  // solve L*L'*X = B
  // if L is correct, X is identity */
     
    {
    int uplo = 'L';
    const char *uplo_char = ((uplo == (int) 'U')
                    || (uplo == (int) 'u')) ? "U" : "L";
    int info = 0;
    int nrhs = N;
    int LDA = N;
    int ldb = N;
    dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info);
    assert (info == 0);
    }

    {
    double maxerr = 0;
    double maxerr2 = 0;

    for (j = 0; j < N; j++)
      {
        for (i = 0; i < N; i++)
          {
            double eyeij = (i == j) ? 1.0 : 0.0;
            double err = (test_A2 (i, j) - eyeij);
            err = ABS (err);
            maxerr = MAX (maxerr, err);
            maxerr2 = maxerr2 + err * err;
          };
      };

    maxerr2 = sqrt (ABS (maxerr2));
    printf("time %lf\n", MPI_Wtime() - llttime);
    printf ("max error %lf \n", maxerr);
    printf ("max L2 error %lf \n", maxerr2);
    }
#endif

    free(test_A);test_A=NULL;
#ifdef VERIFY
    free(test_A2);test_A2=NULL;
#endif
    blacs_gridexit_(&ICTXT);
    blacs_exit_(&i_zero);
    return 0;
    #undef test_A
    #undef test_A2
}
Exemplo n.º 6
0
	~MagmaSpectralSolver() {
		cublasShutdown();
	}
Exemplo n.º 7
0
/*@C
   PetscFinalize - Checks for options to be called at the conclusion
   of the program. MPI_Finalize() is called only if the user had not
   called MPI_Init() before calling PetscInitialize().

   Collective on PETSC_COMM_WORLD

   Options Database Keys:
+  -options_table - Calls PetscOptionsView()
.  -options_left - Prints unused options that remain in the database
.  -objects_dump [all] - Prints list of objects allocated by the user that have not been freed, the option all cause all outstanding objects to be listed
.  -mpidump - Calls PetscMPIDump()
.  -malloc_dump - Calls PetscMallocDump()
.  -malloc_info - Prints total memory usage
-  -malloc_log - Prints summary of memory usage

   Level: beginner

   Note:
   See PetscInitialize() for more general runtime options.

.seealso: PetscInitialize(), PetscOptionsView(), PetscMallocDump(), PetscMPIDump(), PetscEnd()
@*/
PetscErrorCode  PetscFinalize(void)
{
  PetscErrorCode ierr;
  PetscMPIInt    rank;
  PetscInt       nopt;
  PetscBool      flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE;
#if defined(PETSC_HAVE_AMS)
  PetscBool      flg = PETSC_FALSE;
#endif
#if defined(PETSC_USE_LOG)
  char           mname[PETSC_MAX_PATH_LEN];
#endif

  PetscFunctionBegin;
  if (!PetscInitializeCalled) {
    printf("PetscInitialize() must be called before PetscFinalize()\n");
    PetscFunctionReturn(PETSC_ERR_ARG_WRONGSTATE);
  }
  ierr = PetscInfo(NULL,"PetscFinalize() called\n");CHKERRQ(ierr);

#if defined(PETSC_SERIALIZE_FUNCTIONS)
  ierr = PetscFPTDestroy();CHKERRQ(ierr);
#endif


#if defined(PETSC_HAVE_AMS)
  ierr = PetscOptionsGetBool(NULL,"-options_gui",&flg,NULL);CHKERRQ(ierr);
  if (flg) {
    ierr = PetscOptionsAMSDestroy();CHKERRQ(ierr);
  }
#endif

#if defined(PETSC_HAVE_SERVER)
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-server",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    /*  this is a crude hack, but better than nothing */
    ierr = PetscPOpen(PETSC_COMM_WORLD,NULL,"pkill -9 petscwebserver","r",NULL);CHKERRQ(ierr);
  }
#endif

  ierr = PetscHMPIFinalize();CHKERRQ(ierr);

  ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg2,NULL);CHKERRQ(ierr);
  if (!flg2) {
    flg2 = PETSC_FALSE;
    ierr = PetscOptionsGetBool(NULL,"-memory_info",&flg2,NULL);CHKERRQ(ierr);
  }
  if (flg2) {
    ierr = PetscMemoryShowUsage(PETSC_VIEWER_STDOUT_WORLD,"Summary of Memory Usage in PETSc\n");CHKERRQ(ierr);
  }

#if defined(PETSC_USE_LOG)
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-get_total_flops",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    PetscLogDouble flops = 0;
    ierr = MPI_Reduce(&petsc_TotalFlops,&flops,1,MPI_DOUBLE,MPI_SUM,0,PETSC_COMM_WORLD);CHKERRQ(ierr);
    ierr = PetscPrintf(PETSC_COMM_WORLD,"Total flops over all processors %g\n",flops);CHKERRQ(ierr);
  }
#endif


#if defined(PETSC_USE_LOG)
#if defined(PETSC_HAVE_MPE)
  mname[0] = 0;

  ierr = PetscOptionsGetString(NULL,"-log_mpe",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    if (mname[0]) {ierr = PetscLogMPEDump(mname);CHKERRQ(ierr);}
    else          {ierr = PetscLogMPEDump(0);CHKERRQ(ierr);}
  }
#endif
  mname[0] = 0;

  ierr = PetscOptionsGetString(NULL,"-log_summary",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    PetscViewer viewer;
    if (mname[0]) {
      ierr = PetscViewerASCIIOpen(PETSC_COMM_WORLD,mname,&viewer);CHKERRQ(ierr);
      ierr = PetscLogView(viewer);CHKERRQ(ierr);
      ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr);
    } else {
      viewer = PETSC_VIEWER_STDOUT_WORLD;
      ierr   = PetscLogView(viewer);CHKERRQ(ierr);
    }
  }

  mname[0] = 0;

  ierr = PetscOptionsGetString(NULL,"-log_summary_python",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    PetscViewer viewer;
    if (mname[0]) {
      ierr = PetscViewerASCIIOpen(PETSC_COMM_WORLD,mname,&viewer);CHKERRQ(ierr);
      ierr = PetscLogViewPython(viewer);CHKERRQ(ierr);
      ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr);
    } else {
      viewer = PETSC_VIEWER_STDOUT_WORLD;
      ierr   = PetscLogViewPython(viewer);CHKERRQ(ierr);
    }
  }

  ierr = PetscOptionsGetString(NULL,"-log_detailed",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  if (flg1) {
    if (mname[0])  {ierr = PetscLogPrintDetailed(PETSC_COMM_WORLD,mname);CHKERRQ(ierr);}
    else           {ierr = PetscLogPrintDetailed(PETSC_COMM_WORLD,0);CHKERRQ(ierr);}
  }

  mname[0] = 0;

  ierr = PetscOptionsGetString(NULL,"-log_all",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsGetString(NULL,"-log",mname,PETSC_MAX_PATH_LEN,&flg2);CHKERRQ(ierr);
  if (flg1 || flg2) {
    if (mname[0]) PetscLogDump(mname);
    else          PetscLogDump(0);
  }
#endif

  /*
     Free all objects registered with PetscObjectRegisterDestroy() such as PETSC_VIEWER_XXX_().
  */
  ierr = PetscObjectRegisterDestroyAll();CHKERRQ(ierr);

  ierr = PetscStackDestroy();CHKERRQ(ierr);

  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr);
  if (!flg1) { ierr = PetscPopSignalHandler();CHKERRQ(ierr);}
  flg1 = PETSC_FALSE;
  ierr = PetscOptionsGetBool(NULL,"-mpidump",&flg1,NULL);CHKERRQ(ierr);
  if (flg1) {
    ierr = PetscMPIDump(stdout);CHKERRQ(ierr);
  }
  flg1 = PETSC_FALSE;
  flg2 = PETSC_FALSE;
  /* preemptive call to avoid listing this option in options table as unused */
  ierr = PetscOptionsHasName(NULL,"-malloc_dump",&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-objects_dump",&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsGetBool(NULL,"-options_table",&flg2,NULL);CHKERRQ(ierr);

  if (flg2) {
    PetscViewer viewer;
    ierr = PetscViewerASCIIGetStdout(PETSC_COMM_WORLD,&viewer);CHKERRQ(ierr);
    ierr = PetscOptionsView(viewer);CHKERRQ(ierr);
    ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr);
  }

  /* to prevent PETSc -options_left from warning */
  ierr = PetscOptionsHasName(NULL,"-nox",&flg1);CHKERRQ(ierr);
  ierr = PetscOptionsHasName(NULL,"-nox_warning",&flg1);CHKERRQ(ierr);

  if (!PetscHMPIWorker) { /* worker processes skip this because they do not usually process options */
    flg3 = PETSC_FALSE; /* default value is required */
    ierr = PetscOptionsGetBool(NULL,"-options_left",&flg3,&flg1);CHKERRQ(ierr);
    ierr = PetscOptionsAllUsed(&nopt);CHKERRQ(ierr);
    if (flg3) {
      if (!flg2) { /* have not yet printed the options */
        PetscViewer viewer;
        ierr = PetscViewerASCIIGetStdout(PETSC_COMM_WORLD,&viewer);CHKERRQ(ierr);
        ierr = PetscOptionsView(viewer);CHKERRQ(ierr);
        ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr);
      }
      if (!nopt) {
        ierr = PetscPrintf(PETSC_COMM_WORLD,"There are no unused options.\n");CHKERRQ(ierr);
      } else if (nopt == 1) {
        ierr = PetscPrintf(PETSC_COMM_WORLD,"There is one unused database option. It is:\n");CHKERRQ(ierr);
      } else {
        ierr = PetscPrintf(PETSC_COMM_WORLD,"There are %D unused database options. They are:\n",nopt);CHKERRQ(ierr);
      }
    }
#if defined(PETSC_USE_DEBUG)
    if (nopt && !flg3 && !flg1) {
      ierr = PetscPrintf(PETSC_COMM_WORLD,"WARNING! There are options you set that were not used!\n");CHKERRQ(ierr);
      ierr = PetscPrintf(PETSC_COMM_WORLD,"WARNING! could be spelling mistake, etc!\n");CHKERRQ(ierr);
      ierr = PetscOptionsLeft();CHKERRQ(ierr);
    } else if (nopt && flg3) {
#else
    if (nopt && flg3) {
#endif
      ierr = PetscOptionsLeft();CHKERRQ(ierr);
    }
  }

  {
    PetscThreadComm tcomm_world;
    ierr = PetscGetThreadCommWorld(&tcomm_world);CHKERRQ(ierr);
    /* Free global thread communicator */
    ierr = PetscThreadCommDestroy(&tcomm_world);CHKERRQ(ierr);
  }

  /*
       List all objects the user may have forgot to free
  */
  ierr = PetscOptionsHasName(NULL,"-objects_dump",&flg1);CHKERRQ(ierr);
  if (flg1) {
    MPI_Comm local_comm;
    char     string[64];

    ierr = PetscOptionsGetString(NULL,"-objects_dump",string,64,NULL);CHKERRQ(ierr);
    ierr = MPI_Comm_dup(MPI_COMM_WORLD,&local_comm);CHKERRQ(ierr);
    ierr = PetscSequentialPhaseBegin_Private(local_comm,1);CHKERRQ(ierr);
    ierr = PetscObjectsDump(stdout,(string[0] == 'a') ? PETSC_TRUE : PETSC_FALSE);CHKERRQ(ierr);
    ierr = PetscSequentialPhaseEnd_Private(local_comm,1);CHKERRQ(ierr);
    ierr = MPI_Comm_free(&local_comm);CHKERRQ(ierr);
  }
  PetscObjectsCounts    = 0;
  PetscObjectsMaxCounts = 0;

  ierr = PetscFree(PetscObjects);CHKERRQ(ierr);

#if defined(PETSC_USE_LOG)
  ierr = PetscLogDestroy();CHKERRQ(ierr);
#endif

  /*
     Destroy any packages that registered a finalize
  */
  ierr = PetscRegisterFinalizeAll();CHKERRQ(ierr);

  /*
     Destroy all the function registration lists created
  */
  ierr = PetscFinalize_DynamicLibraries();CHKERRQ(ierr);

  /*
     Print PetscFunctionLists that have not been properly freed

  ierr = PetscFunctionListPrintAll();CHKERRQ(ierr);
  */

  if (petsc_history) {
    ierr = PetscCloseHistoryFile(&petsc_history);CHKERRQ(ierr);
    petsc_history = 0;
  }

  ierr = PetscInfoAllow(PETSC_FALSE,NULL);CHKERRQ(ierr);

  {
    char fname[PETSC_MAX_PATH_LEN];
    FILE *fd;
    int  err;

    fname[0] = 0;

    ierr = PetscOptionsGetString(NULL,"-malloc_dump",fname,250,&flg1);CHKERRQ(ierr);
    flg2 = PETSC_FALSE;
    ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg2,NULL);CHKERRQ(ierr);
#if defined(PETSC_USE_DEBUG)
    if (PETSC_RUNNING_ON_VALGRIND) flg2 = PETSC_FALSE;
#else
    flg2 = PETSC_FALSE;         /* Skip reporting for optimized builds regardless of -malloc_test */
#endif
    if (flg1 && fname[0]) {
      char sname[PETSC_MAX_PATH_LEN];

      sprintf(sname,"%s_%d",fname,rank);
      fd   = fopen(sname,"w"); if (!fd) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Cannot open log file: %s",sname);
      ierr = PetscMallocDump(fd);CHKERRQ(ierr);
      err  = fclose(fd);
      if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fclose() failed on file");
    } else if (flg1 || flg2) {
      MPI_Comm local_comm;

      ierr = MPI_Comm_dup(MPI_COMM_WORLD,&local_comm);CHKERRQ(ierr);
      ierr = PetscSequentialPhaseBegin_Private(local_comm,1);CHKERRQ(ierr);
      ierr = PetscMallocDump(stdout);CHKERRQ(ierr);
      ierr = PetscSequentialPhaseEnd_Private(local_comm,1);CHKERRQ(ierr);
      ierr = MPI_Comm_free(&local_comm);CHKERRQ(ierr);
    }
  }

  {
    char fname[PETSC_MAX_PATH_LEN];
    FILE *fd = NULL;

    fname[0] = 0;

    ierr = PetscOptionsGetString(NULL,"-malloc_log",fname,250,&flg1);CHKERRQ(ierr);
    ierr = PetscOptionsHasName(NULL,"-malloc_log_threshold",&flg2);CHKERRQ(ierr);
    if (flg1 && fname[0]) {
      int err;

      if (!rank) {
        fd = fopen(fname,"w");
        if (!fd) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Cannot open log file: %s",fname);
      }
      ierr = PetscMallocDumpLog(fd);CHKERRQ(ierr);
      if (fd) {
        err = fclose(fd);
        if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fclose() failed on file");
      }
    } else if (flg1 || flg2) {
      ierr = PetscMallocDumpLog(stdout);CHKERRQ(ierr);
    }
  }
  /* Can be destroyed only after all the options are used */
  ierr = PetscOptionsDestroy();CHKERRQ(ierr);

  PetscGlobalArgc = 0;
  PetscGlobalArgs = 0;

#if defined(PETSC_USE_REAL___FLOAT128)
  ierr = MPI_Type_free(&MPIU___FLOAT128);CHKERRQ(ierr);
#if defined(PETSC_HAVE_COMPLEX)
  ierr = MPI_Type_free(&MPIU___COMPLEX128);CHKERRQ(ierr);
#endif
  ierr = MPI_Op_free(&MPIU_MAX);CHKERRQ(ierr);
  ierr = MPI_Op_free(&MPIU_MIN);CHKERRQ(ierr);
#endif

#if defined(PETSC_HAVE_COMPLEX)
#if !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX)
  ierr = MPI_Type_free(&MPIU_C_DOUBLE_COMPLEX);CHKERRQ(ierr);
  ierr = MPI_Type_free(&MPIU_C_COMPLEX);CHKERRQ(ierr);
#endif
#endif

#if (defined(PETSC_HAVE_COMPLEX) && !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX)) || defined(PETSC_USE_REAL___FLOAT128)
  ierr = MPI_Op_free(&MPIU_SUM);CHKERRQ(ierr);
#endif

  ierr = MPI_Type_free(&MPIU_2SCALAR);CHKERRQ(ierr);
#if defined(PETSC_USE_64BIT_INDICES) || !defined(MPI_2INT)
  ierr = MPI_Type_free(&MPIU_2INT);CHKERRQ(ierr);
#endif
  ierr = MPI_Op_free(&PetscMaxSum_Op);CHKERRQ(ierr);
  ierr = MPI_Op_free(&PetscADMax_Op);CHKERRQ(ierr);
  ierr = MPI_Op_free(&PetscADMin_Op);CHKERRQ(ierr);

  /*
     Destroy any known inner MPI_Comm's and attributes pointing to them
     Note this will not destroy any new communicators the user has created.

     If all PETSc objects were not destroyed those left over objects will have hanging references to
     the MPI_Comms that were freed; but that is ok because those PETSc objects will never be used again
 */
  {
    PetscCommCounter *counter;
    PetscMPIInt      flg;
    MPI_Comm         icomm;
    union {MPI_Comm comm; void *ptr;} ucomm;
    ierr = MPI_Attr_get(PETSC_COMM_SELF,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRQ(ierr);
    if (flg) {
      icomm = ucomm.comm;
      ierr = MPI_Attr_get(icomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
      if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_CORRUPT,"Inner MPI_Comm does not have expected tag/name counter, problem with corrupted memory");

      ierr = MPI_Attr_delete(PETSC_COMM_SELF,Petsc_InnerComm_keyval);CHKERRQ(ierr);
      ierr = MPI_Attr_delete(icomm,Petsc_Counter_keyval);CHKERRQ(ierr);
      ierr = MPI_Comm_free(&icomm);CHKERRQ(ierr);
    }
    ierr = MPI_Attr_get(PETSC_COMM_WORLD,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRQ(ierr);
    if (flg) {
      icomm = ucomm.comm;
      ierr = MPI_Attr_get(icomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr);
      if (!flg) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_CORRUPT,"Inner MPI_Comm does not have expected tag/name counter, problem with corrupted memory");

      ierr = MPI_Attr_delete(PETSC_COMM_WORLD,Petsc_InnerComm_keyval);CHKERRQ(ierr);
      ierr = MPI_Attr_delete(icomm,Petsc_Counter_keyval);CHKERRQ(ierr);
      ierr = MPI_Comm_free(&icomm);CHKERRQ(ierr);
    }
  }

  ierr = MPI_Keyval_free(&Petsc_Counter_keyval);CHKERRQ(ierr);
  ierr = MPI_Keyval_free(&Petsc_InnerComm_keyval);CHKERRQ(ierr);
  ierr = MPI_Keyval_free(&Petsc_OuterComm_keyval);CHKERRQ(ierr);

#if defined(PETSC_HAVE_CUDA)
  {
    PetscInt p;
    for (p = 0; p < PetscGlobalSize; ++p) {
      if (p == PetscGlobalRank) cublasShutdown();
      ierr = MPI_Barrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
    }
  }
#endif

  if (PetscBeganMPI) {
#if defined(PETSC_HAVE_MPI_FINALIZED)
    PetscMPIInt flag;
    ierr = MPI_Finalized(&flag);CHKERRQ(ierr);
    if (flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"MPI_Finalize() has already been called, even though MPI_Init() was called by PetscInitialize()");
#endif
    ierr = MPI_Finalize();CHKERRQ(ierr);
  }
/*

     Note: In certain cases PETSC_COMM_WORLD is never MPI_Comm_free()ed because
   the communicator has some outstanding requests on it. Specifically if the
   flag PETSC_HAVE_BROKEN_REQUEST_FREE is set (for IBM MPI implementation). See
   src/vec/utils/vpscat.c. Due to this the memory allocated in PetscCommDuplicate()
   is never freed as it should be. Thus one may obtain messages of the form
   [ 1] 8 bytes PetscCommDuplicate() line 645 in src/sys/mpiu.c indicating the
   memory was not freed.

*/
  ierr = PetscMallocClear();CHKERRQ(ierr);

  PetscInitializeCalled = PETSC_FALSE;
  PetscFinalizeCalled   = PETSC_TRUE;
  PetscFunctionReturn(ierr);
}

#if defined(PETSC_MISSING_LAPACK_lsame_)
PETSC_EXTERN int lsame_(char *a,char *b)
{
  if (*a == *b) return 1;
  if (*a + 32 == *b) return 1;
  if (*a - 32 == *b) return 1;
  return 0;
}
#endif

#if defined(PETSC_MISSING_LAPACK_lsame)
PETSC_EXTERN int lsame(char *a,char *b)
{
  if (*a == *b) return 1;
  if (*a + 32 == *b) return 1;
  if (*a - 32 == *b) return 1;
  return 0;
}
Exemplo n.º 8
0
SEXP magmaCholeskyFinal(SEXP A, SEXP n, SEXP NB, SEXP id, SEXP zeroTri, SEXP lowerTri)
{
	magma_init();
//	magma_print_devices();
	
	double *h_R;
	int In, INB, ID;
	In = INTEGER_VALUE(n);
	INB = INTEGER_VALUE(NB);
	ID = INTEGER_VALUE(id);
	double *PA = NUMERIC_POINTER(A);
	int i,j;

	magma_int_t N, n2, lda, status, info, max_size;
	N=In;
   	lda = N;
   	n2 = lda*N;
  	

/*	for(i = 0; i < In; i++)
	{
		for(j = 0; j < In; j++)
		{
			printf("%.8f ", PA[i+j*In]);
		}
		printf("\n");
	}	*/

	if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) {      
        fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); 
        magma_finalize();                                                  
        exit(-1);   
     	}
        

        lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda );
	
	N = In;
	status = 0;
	magma_setdevice(ID);
	//printf("Modified by Vinay in one GPU\n");
	//INB = magma_get_dpotrf_nb(N);
//	INB = 224;
//	printf("INB = %d\n", INB);
	//ngpu = ndevices;
//	printf("ngpu = %d\n", ngpu);
	//max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB);
//	printf("max_size = %d\n", max_size);
	//int imax_size = max_size;
	//double *dA;
	//magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double));
	//ldda = (1+N/(INB*ndevices))*INB;
//	printf("ldda = %d\n", ldda);
	//magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB);
	//magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info);
	int lTri;
	lTri = INTEGER_VALUE(lowerTri);
	if(lTri)
		magma_dpotrf(MagmaLower, N, h_R, N, &info);
	else
		magma_dpotrf(MagmaUpper, N, h_R, N, &info);
	if(info != 0)
	{
		printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info));
	}
		
	lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda );
	//magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB);
	//for(dev = 0; dev < ndevices; dev++)
	//{
		//magma_setdevice(dev);
		//cudaFree(dA[dev]);
	//}
	magma_free_pinned(h_R);
	magma_finalize();
	cublasShutdown();
	/*
	int IZeroTri;
        IZeroTri = INTEGER_VALUE(zeroTri);
	if(IZeroTri & lTri) {
		for(i = 1; i < In; i++)
        	{
       			for(j=0; j< i; j++)
                	{
                       		PA[i*In+j] = 0.0;
                	}
        	}
	}
	else if(IZeroTri)
		for(i = 0; i < In; i++)
                {
                        for(j=i+1; j < In; j++)
                        {
                                PA[i*In+j] = 0.0;
                        }
                }*/
	return(R_NilValue);
}
Exemplo n.º 9
0
/* ========================================================================== */
int sci_gpuLU(char *fname)
{
    CheckRhs(1,2);
    CheckLhs(2,2);
    #ifdef WITH_CUDA
        cublasStatus status;
    #endif
    SciErr sciErr;
    int*    piAddr_A    = NULL;
    double* h_A         = NULL;
    double* hi_A        = NULL;
    int     rows_A;
    int     cols_A;

    int*    piAddr_Opt  = NULL;
    double* option      = NULL;
    int     rows_Opt;
    int     cols_Opt;

    void*   d_A         = NULL;
    int     na;
    void*   pvPtr       = NULL;

    int     size_A      = sizeof(double);
    bool    bComplex_A  = FALSE;
    int     inputType_A;
    int     inputType_Opt;
    double  res;
    int     posOutput   = 1;

    try
    {
        sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A);
        if(sciErr.iErr) throw sciErr;
        if(Rhs == 2)
        {
            sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt);
            if(sciErr.iErr) throw sciErr;
            sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt);
            if(sciErr.iErr) throw sciErr;
            if(inputType_Opt == sci_matrix)
            {
                sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option);
                if(sciErr.iErr) throw sciErr;
            }
            else
                throw "Option syntax is [number,number].";
        }
        else
        {
            rows_Opt=1;
            cols_Opt=2;
            option = (double*)malloc(2*sizeof(double));
            option[0]=0;
            option[1]=0;
        }

        if(rows_Opt != 1 || cols_Opt != 2)
            throw "Option syntax is [number,number].";

        if((int)option[1] == 1 && !isGpuInit())
            throw "gpu is not initialised. Please launch gpuInit() before use this function.";

        sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A);
        if(sciErr.iErr) throw sciErr;

        #ifdef WITH_CUDA
        if (useCuda())
        {
            if(inputType_A == sci_pointer)
            {
                sciErr = getPointer(pvApiCtx, piAddr_A, (void**)&pvPtr);
                if(sciErr.iErr) throw sciErr;

                gpuMat_CUDA* gmat;
                gmat = static_cast<gpuMat_CUDA*>(pvPtr);
				if(!gmat->useCuda)
					throw "Please switch to OpenCL mode before use this data.";
                rows_A=gmat->rows;
                cols_A=gmat->columns;
                if(gmat->complex)
                {
                    bComplex_A = TRUE;
                    size_A = sizeof(cuDoubleComplex);
                    d_A=(cuDoubleComplex*)gmat->ptr->get_ptr();
                }
                else
                    d_A=(double*)gmat->ptr->get_ptr();

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                na = rows_A * cols_A;
            }
            else if(inputType_A == 1)
            {
                // Get size and data
                if(isVarComplex(pvApiCtx, piAddr_A))
                {
                    sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A);
                    if(sciErr.iErr) throw sciErr;
                    size_A = sizeof(cuDoubleComplex);
                    bComplex_A = TRUE;
                }
                else
                {
                    sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A);
                    if(sciErr.iErr) throw sciErr;
                }

                na = rows_A * cols_A;

                // Initialize CUBLAS
                status = cublasInit();
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Allocate device memory
                status = cublasAlloc(na, size_A, (void**)&d_A);
                if (status != CUBLAS_STATUS_SUCCESS) throw status;

                // Initialize the device matrices with the host matrices
                if(!bComplex_A)
                {
                    status = cublasSetMatrix(rows_A,cols_A, sizeof(double), h_A, rows_A, (double*)d_A, rows_A);
                    if (status != CUBLAS_STATUS_SUCCESS) throw status;
                }
                else
                    writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A);

            }
            else
                throw "Bad argument type.";

            cuDoubleComplex resComplex;
            // Performs operation
            if(!bComplex_A)
                status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double*)d_A, 1);
       //     else
       //         resComplex = cublasZtrsm(na,(cuDoubleComplex*)d_A);

            if (status != CUBLAS_STATUS_SUCCESS) throw status;

            // Put the result in scilab
            switch((int)option[0])
            {
                case 2 :
                case 1 :    sciprint("The first option must be 0 for this function. Considered as 0.\n");

                case 0 :    // Keep the result on the Host.
                {           // Put the result in scilab
                    if(!bComplex_A)
                    {
                        double* h_res = NULL;
                        sciErr=allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res);
                        if(sciErr.iErr) throw sciErr;
                        status = cublasGetMatrix(rows_A,cols_A, sizeof(double), (double*)d_A, rows_A, h_res, rows_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                    }
                    else
                    {
                        sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x,&resComplex.y);
                        if(sciErr.iErr) throw sciErr;
                    }

                    LhsVar(posOutput)=Rhs+posOutput;
                    posOutput++;
                    break;
                }

                default : throw "First option argument must be 0 or 1 or 2.";
            }

            switch((int)option[1])
            {
                case 0 :    // Don't keep the data input on Device.
                {
                    if(inputType_A == sci_matrix)
                    {
                        status = cublasFree(d_A);
                        if (status != CUBLAS_STATUS_SUCCESS) throw status;
                        d_A = NULL;
                    }
                    break;
                }
                case 1 :    // Keep data of the fisrt argument on Device and return the Device pointer.
                {
                    if(inputType_A == sci_matrix)
                    {
                        gpuMat_CUDA* dptr;
                        gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
                        dptr=new gpuMat_CUDA(tmp);
						dptr->useCuda = true;
                        dptr->ptr->set_ptr((double*)d_A);
                        if(bComplex_A)
                            dptr->complex=TRUE;
                        else
                            dptr->complex=FALSE;

                        sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
                        if(sciErr.iErr) throw sciErr;
                        LhsVar(posOutput)=Rhs+posOutput;
                    }
                    else
                        throw "The first input argument is already a GPU variable.";

                    posOutput++;
                    break;
                }

                default : throw "Second option argument must be 0 or 1.";
            }
            // Shutdown
            status = cublasShutdown();
            if (status != CUBLAS_STATUS_SUCCESS) throw status;
        }
        #endif

        #ifdef WITH_OPENCL
        if (!useCuda())
        {
            throw "not implemented with OpenCL.";
        }
        #endif
        if(Rhs == 1)
        {
            free(option);
            option = NULL;
        }

        if(posOutput < Lhs+1)
            throw "Too many output arguments.";

        if(posOutput > Lhs+1)
            throw "Too few output arguments.";

        PutLhsVar();
        return 0;
    }
    catch(const char* str)
    {
        Scierror(999,"%s\n",str);
    }
    catch(SciErr E)
    {
        printError(&E, 0);
    }
    #ifdef WITH_CUDA
    catch(cudaError_t cudaE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
    }
    catch(cublasStatus CublasE)
    {
        GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
    }
    if (useCuda())
    {
        if(inputType_A == 1 && d_A != NULL) cudaFree(d_A);
    }
    #endif
    #ifdef WITH_OPENCL
    if (!useCuda())
    {
        Scierror(999,"not implemented with OpenCL.\n");
    }
    #endif
    if(Rhs == 1 && option != NULL) free(option);
    return EXIT_FAILURE;
}
Exemplo n.º 10
0
SEXP smagmaCholeskyFinal_m(SEXP A, SEXP n, SEXP NB, SEXP zeroTri, SEXP ngpu, SEXP lowerTri)
{
	magma_init();
	int ndevices;
	ndevices = INTEGER_VALUE(ngpu);
        int idevice;
        for(idevice=0; idevice < ndevices; idevice++)
        {
                magma_setdevice(idevice);
                if(CUBLAS_STATUS_SUCCESS != cublasInit())
                {
                        printf("Error: gpu %d: cublasInit failed\n", idevice);
                        magma_finalize();
                        exit(-1);
                }
        }
//	magma_print_devices();
	
	int In, INB;
	In = INTEGER_VALUE(n);
	INB = INTEGER_VALUE(NB);
	double *PA = NUMERIC_POINTER(A);
	float *sPA = calloc(In*In, sizeof(float));
	int i,j;
	for(i = 0; i < In; i++)
        {
                for(j = 0; j < In; j++)
                {
                        sPA[i*In + j] = (float) PA[i*In + j];
                }
        }
	magma_int_t N, status, info, nGPUs;
	N = In;
	status = 0;
	nGPUs = ndevices;
	
	//INB = magma_get_dpotrf_nb(N);
//	INB = 224;
//	printf("INB = %d\n", INB);
	//ngpu = ndevices;
//	printf("ngpu = %d\n", ngpu);
	//max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB);
//	printf("max_size = %d\n", max_size);
	//int imax_size = max_size;
	//double *dA;
	//magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double));
	
	//ldda = (1+N/(INB*ndevices))*INB;
//	printf("ldda = %d\n", ldda);
	//magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB);
	//magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info);
	int lTri;
	lTri = INTEGER_VALUE(lowerTri);
	if(lTri)
		magma_spotrf_m(nGPUs, MagmaLower, N, sPA, N, &info);
	else
		magma_spotrf_m(nGPUs, MagmaUpper, N, sPA, N, &info);
	if(info != 0)
	{
		printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror(info));
	}
	
	//magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB);
	//for(dev = 0; dev < ndevices; dev++)
	//{
		//magma_setdevice(dev);
		//cudaFree(dA[dev]);
	//}
	magma_finalize();
	cublasShutdown();
	
	//caste sPA back to double and set upper or lower triangle to zero if necessary:
	int IZeroTri = INTEGER_VALUE(zeroTri);
        int zeroUTri = IZeroTri & lTri;
        int zeroLTri = IZeroTri & !lTri;
        if(!IZeroTri) {
                for(i = 1; i< In; i++) {
                        for(j=1; j < In; j++) {
                                PA[i*In + j] = (double) sPA[i*In + j];
                        }
        }
        } else if(zeroUTri) {
                for(i = 1; i< In; i++) {
                        for(j=1; j < In; j++) {
                                if(i > j)
                                        PA[i*In + j] = 0;
                                else
                                        PA[i*In + j] = (double) sPA[i*In + j];
                        }
                }
        } else {
                for(i = 1; i< In; i++) {
                        for(j=1; j < In; j++) {
                                if(i < j)
                                        PA[i*In + j] = 0;
                                else
                                        PA[i*In + j] = (double) sPA[i*In + j];
                        }
                }
        }
	
	UNPROTECT(1);
	free(sPA);
	return(R_NilValue);
}
Exemplo n.º 11
0
int
main( int argc,char** argv)
{
	printf("hello world\n");

	if (!InitCUDA())
	{
		return 0;
	}



	int iter = 1000;
	int trainnum = 20;
	bool isProfiler = false;
	int intProfiler = 0;
	int testnum = -1;
	float maxtime = 0.0f;
	cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler);
	cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum);
	cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime);
	printf("%d\n", intProfiler);
	if(intProfiler)
	{
		isProfiler = true;
	}
	if(testnum == -1) testnum = trainnum /2;
	printf("Iter = %d\n", iter);
	printf("TrainNum = %d\n", trainnum);
	printf("TestNum = %d\n", testnum);

	CUT_DEVICE_INIT(argc, argv);


	cublasStatus status;
	status = cublasInit();
	if(status != CUBLAS_STATUS_SUCCESS)
	{
		printf("Can't init cublas\n");
		printf("%s\n", cudaGetErrorString(cudaGetLastError()));
		return -1;
	}


	Image* imageList = new Image[trainnum+testnum];
	read64("my_optdigits.tra", imageList, trainnum + testnum);

	const int warmUpTime = 3;
	if(!isProfiler)
	{
		freopen("verbose.txt", "w", stdout);
		for(int i=0;i< warmUpTime;i++)
		{
			runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f);
		}
		freopen("CON", "w", stdout);
		printf("Warm-up complete.\n\n\n");
	}
#ifdef _DEBUG
	freopen("out.txt", "w", stdout);
#endif // _DEBUG
	runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime);
	freopen("CON", "w", stdout);
	delete[] imageList;
	//TestReduce();
	
	cublasShutdown();
	if(!isProfiler)
	{
		CUT_EXIT(argc, argv);
	}
	//getchar();
	return 0;
}
Exemplo n.º 12
0
inline void ShutdownTensorEngine( void ){
  cublasShutdown();
}
Exemplo n.º 13
0
Arquivo: xgemm.c Projeto: deccs/PLearn
/* Main */
int main(int argc, char** argv)
{    
  if (argc!=5){ 
    fprintf (stderr, "Usage: %s <sizeM> <sizeN> <sizeK> <Nb iter>\n",argv[0]); 
    exit(0); 
  } 
  const int M=strtol(argv[1],0,0);
  const int N=strtol(argv[2],0,0);
  const int K=strtol(argv[3],0,0);
  const int NBITER=strtol(argv[4],0,0);
  const int NA= M * K;
  const int NB= K * N;
  const int NC= M * N;
  real* h_A;
  real* h_B;
  real* h_C;
  const real alpha = 1.0f;
  const real beta = 0.0f;
#ifdef NVIDIA
  cublasStatus status;
  real* d_A = 0;
  real* d_B = 0;
  real* d_C = 0;
#endif

#ifdef COMPARE
  real* h_C_ref;
  real error_norm;
  real ref_norm;
  real diff;
#endif

    /* Allocate host memory for the matrices */
    h_A = (real*)malloc(NA * sizeof(h_A[0]));
    if (h_A == 0) {
        fprintf (stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    h_B = (real*)malloc(NB * sizeof(h_B[0]));
    if (h_B == 0) {
        fprintf (stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    h_C = (real*)malloc(NC * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    for (int i = 0; i < NA; ++i) h_A[i] = M_PI+(real)i;
    for (int i = 0; i < NB; ++i) h_B[i] = M_PI+(real)i;

#ifdef NVIDIA
    /* Initialize CUBLAS */
    status = cublasInit();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }
    /* Allocate device memory for the matrices */
    status = cublasAlloc(NA, sizeof(d_A[0]), (void**)&d_A);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (A)\n");
        return EXIT_FAILURE;
    }
    status = cublasAlloc(NB, sizeof(d_B[0]), (void**)&d_B);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (B)\n");
        return EXIT_FAILURE;
    }
    status = cublasAlloc(NC, sizeof(d_C[0]), (void**)&d_C);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    /* Initialize the device matrices with the host matrices */
    status = cublasSetVector(NA, sizeof(h_A[0]), h_A, 1, d_A, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(NB, sizeof(h_B[0]), h_B, 1, d_B, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }
    status = cublasSetVector(NC, sizeof(h_C[0]), h_C, 1, d_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    /* Clear last error */
    cublasGetError();
#endif
#ifdef COMPARE
    /* Performs operation using plain C code */
    for (int i=0;i<NBITER;i++)
      c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C);
    h_C_ref = h_C;
    /* Allocate host memory for reading back the result from device memory */
    h_C = (real*)malloc(NC * sizeof(h_C[0]));
    if (h_C == 0) {
        fprintf (stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }
#endif
#ifdef NVIDIA
    /* Performs operation using cublas */
    for (int i=0;i<NBITER;i++)
      //We must Change the order of the parameter as cublas take
      //matrix as colomn major and C matrix is row major
      cublasSgemm('n', 'n', N, M, K, alpha, d_B, N, d_A, K, beta, d_C, N);

    status = cublasGetError();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }
    /* Read the result back */
    status = cublasGetVector(NC, sizeof(h_C[0]), d_C, 1, h_C, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }
#elif defined( CXGEMM )
    for (int i=0;i<NBITER;i++)
      c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C);
#else
    char transa='N', transb='N';
    for (int i=0;i<NBITER;i++)
      sgemm_(&transb, &transa, &N, &M, &K, &alpha, h_B, &N, h_A, &K, &beta, h_C, &N);

#endif
#ifdef COMPARE
    /* Check result against reference */
    error_norm = 0;
    ref_norm = 0;
    for (int i = 0; i < NC; ++i) {
        diff = h_C_ref[i] - h_C[i];
        error_norm += diff * diff;
        ref_norm += h_C_ref[i] * h_C_ref[i];
    }
    error_norm = (float)sqrt((double)error_norm);
    ref_norm = (float)sqrt((double)ref_norm);
    if (fabs(ref_norm) < 1e-7) {
        fprintf (stderr, "!!!! reference norm is 0\n");
        return EXIT_FAILURE;
    }
    printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED");
#endif

    /* Memory clean up */
    free(h_A);
    free(h_B);
    free(h_C);

#ifdef NVIDIA
    status = cublasFree(d_A);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(d_B);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }
    status = cublasFree(d_C);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    /* Shutdown */
    status = cublasShutdown();
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf (stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }
#endif
    //    if (argc <= 1 || strcmp(argv[1], "-noprompt")) {
    //        printf("\nPress ENTER to exit...\n");
    //        getchar();
    //    }
    return EXIT_SUCCESS;
}
Exemplo n.º 14
0
int main(void)

{
    
    cublasStatus status;
    
    float* h_image;
    
    float* h_covariance;
    
    float* d_image;
    
    float* d_covariance;
    
    float alpha = 1.0f;
    
    float beta = 0.0f;
    
    int imgsize = N * L;
    
    //int i;
    
    FILE *fp1, *fp2;
    
    /* Initialize CUBLAS */
    
    status = cublasInit();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! CUBLAS initialization error\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Allocate host memory for the image */
    
    h_image = (float*)malloc(imgsize * sizeof(float));
    
    if (h_image == 0) {
        
        fprintf (stderr, "!!!! host memory allocation error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    h_covariance = (float*)calloc(L * L, sizeof(float));
    
    if (h_covariance == 0) {
        
        fprintf (stderr, "!!!! host memory allocation error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Fill the image with test data
     
     for (i = 0; i < imgsize; i++) {
     
     h_image[i] = rand() / (float)RAND_MAX;
     
     }*/
    
    fp1 = fopen("image.dat","rb");
    
    fread(h_image, sizeof(float), imgsize, fp1);
    
    printf("Valor de image[0]: %f\n", h_image[8]);
    
    /* Allocate device memory */
    
    status = cublasAlloc(imgsize, sizeof(float), (void**)&d_image);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device memory allocation error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasAlloc(L * L, sizeof(float), (void**)&d_covariance);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device memory allocation error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Copy image to device memory */
    
    status = cublasSetVector(imgsize, sizeof(float), h_image, 1, d_image, 1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (write A)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasSetVector(L * L, sizeof(float), h_covariance, 1, d_covariance,
                             
                             1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (write covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Clear last error */
    
    cublasGetError();
    
    /* Calculate covariance matrix using cublas */
    
    cublasSgemm('n', 't', L, L, N, alpha, d_image, L, d_image, L, beta,
                
                d_covariance, L);
    
    status = cublasGetError();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! kernel execution error.\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Read the result back */
    
    status = cublasGetVector(L * L, sizeof(float), d_covariance, 1, h_covariance,
                             
                             1);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! device access error (read covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    fp2 = fopen("covariance.dat","wb");
    
    fwrite(h_covariance, sizeof(float), L*L, fp2);
    
    printf("Valor de covariance[8]: %f\n", h_covariance[8]);
    
    /* Memory clean up */
    
    free(h_image);
    
    free(h_covariance);
    
    status = cublasFree(d_image);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! memory free error (image)\n");
        
        return EXIT_FAILURE;
        
    }
    
    status = cublasFree(d_covariance);
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! memory free error (covariance)\n");
        
        return EXIT_FAILURE;
        
    }
    
    /* Shutdown */
    
    status = cublasShutdown();
    
    if (status != CUBLAS_STATUS_SUCCESS) {
        
        fprintf (stderr, "!!!! shutdown error (A)\n");
        
        return EXIT_FAILURE;
        
    }
    
    fclose(fp1);
    
    fclose(fp2);
    
    printf("\nPress ENTER to exit...\n");
    
    getchar();
    
    return EXIT_SUCCESS;
    
}