extern "C" void magma_finalize( magma_context *cntxt) { /* -- MAGMA (version 1.6.0) -- Univ. of Tennessee, Knoxville Univ. of California, Berkeley Univ. of Colorado, Denver @date November 2014 Purpose ======= This function finalizes the MAGMA hardware context. Arguments ========= CNTXT (input) MAGMA_CONTEXT Pointer to the MAGMA hardware context to be closed ===================================================================== */ if (cntxt->num_cores > 1) /* Shut down the QUARK scheduler */ QUARK_Delete(cntxt->quark); if (cntxt->num_gpus == 1) { /* Shutdown CUDA and CUBLAS*/ cuCtxDetach( cntxt->gpu_context[0] ); cublasShutdown(); free(cntxt->gpu_context); } free(cntxt); }
CAMLprim value spoc_cublasShutdown(){ cublasStatus cublas_error= CUBLAS_STATUS_SUCCESS; CUBLAS_CHECK_CALL(cublasShutdown()); return Val_unit; }
/* Main */ int test_cublas(void) { cublasStatus status; cudaError_t e; float* h_A; float* h_B; float* h_C; float* h_C_ref; float* d_A = 0; void *vp; float* d_B = 0; float* d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; float error_norm; float ref_norm; float diff; /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the matrices */ h_A = (float*)malloc(n2 * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (float*)malloc(n2 * sizeof(h_B[0])); if (h_B == 0) { fprintf (stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ if (cudaMalloc(&vp, n2 * sizeof(d_A[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (A)\n"); return EXIT_FAILURE; } d_A = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_B[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (B)\n"); return EXIT_FAILURE; } d_B = (float *) vp; if (cudaMalloc(&vp, n2 * sizeof(d_C[0])) != cudaSuccess) { fprintf (stderr, "!!!! device memory allocation error (C)\n"); return EXIT_FAILURE; } d_C = (float *) vp; /* Initialize the device matrices with the host matrices */ status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Clear last error */ cublasGetError(); /* Performs operation using cublas */ cublasSgemm('n', 'n', N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Allocate host memory for reading back the result from device memory */ h_C = (float*)malloc(n2 * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } /* Check result against reference */ error_norm = 0; ref_norm = 0; for (i = 0; i < n2; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf (stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED"); /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); e = cudaFree(d_A); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } e = cudaFree(d_B); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } e = cudaFree(d_C); if (e != cudaSuccess) { fprintf (stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } return EXIT_SUCCESS; }
SEXP magmaCholeskyFinal_m(SEXP A, SEXP n, SEXP NB, SEXP zeroTri, SEXP ngpu, SEXP lowerTri) { magma_init(); int ndevices; double *h_R; ndevices = INTEGER_VALUE(ngpu); int idevice; for(idevice=0; idevice < ndevices; idevice++) { magma_setdevice(idevice); if(CUBLAS_STATUS_SUCCESS != cublasInit()) { printf("Error: gpu %d: cublasInit failed\n", idevice); magma_finalize(); exit(-1); } } // magma_print_devices(); int In, INB; In = INTEGER_VALUE(n); INB = INTEGER_VALUE(NB); double *PA = NUMERIC_POINTER(A); int i,j; //magma_timestr_t start, end; double gpu_time; printf("Inside magma_dpotrf_m"); /*for(i = 0; i < 5; i++) { for(j = 0; j < 5; j++) { printf("%.8f ", PA[i+j*In]); } printf("\n"); } */ magma_int_t N, status, info, nGPU, n2, lda; clock_t t1, t2; N = In; status = 0; int nGPUs = ndevices; lda = N; n2 = lda*N; if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) { fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); magma_finalize(); exit(-1); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda ); //printf("Modified by Vinay in 2 GPU\n"); //INB = magma_get_dpotrf_nb(N); // INB = 224; // printf("INB = %d\n", INB); //ngpu = ndevices; // printf("ngpu = %d\n", ngpu); //max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB); // printf("max_size = %d\n", max_size); //int imax_size = max_size; //double *dA; //magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double)); //ldda = (1+N/(INB*ndevices))*INB; // printf("ldda = %d\n", ldda); //magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB); //magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info); int lTri; lTri = INTEGER_VALUE(lowerTri); if(lTri){ t1 = clock(); magma_dpotrf_m(nGPUs, MagmaLower, N, h_R, N, &info); t2 = clock (); } else{ t1 = clock(); magma_dpotrf_m(nGPUs, MagmaUpper, N, h_R, N, &info); t2 = clock (); } gpu_time = (double) (t2-t1)/(CLOCKS_PER_SEC) ; // Magma time printf (" magma_dpotrf_m time : %f sec. \n", gpu_time ); if(info != 0) { printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info)); } //magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB); //for(dev = 0; dev < ndevices; dev++) //{ //magma_setdevice(dev); //cudaFree(dA[dev]); //} lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda ); magma_free_pinned(h_R); magma_finalize(); cublasShutdown(); int IZeroTri; IZeroTri = INTEGER_VALUE(zeroTri); if(IZeroTri & lTri) { for(i = 1; i < In; i++) { for(j=0; j< i; j++) { PA[i*In+j] = 0.0; } } } else if(IZeroTri){ for(i = 0; i < In; i++) { for(j=i+1; j < In; j++) { PA[i*In+j] = 0.0; } } } return(R_NilValue); }
int main(int argc, char **argv) { #define test_A(i,j) test_A[(size_t)(j)*N+(i)] #define test_A2(i,j) test_A2[(size_t)(j)*N+(i)] int N,NB,w,LDA,BB; size_t memsize; //bytes int iam, nprocs, mydevice; int ICTXT, nprow, npcol, myprow, mypcol; int i_one = 1, i_zero = 0, i_negone = -1; double d_one = 1.0, d_zero = 0.0, d_negone = -1.0; int IASEED = 100; /* printf("N=?\n"); scanf("%ld",&N); printf("NB=?\n"); scanf("%d", &NB); printf("width of Y panel=?\n"); scanf("%ld",&w); */ if(argc < 4){ printf("invalid arguments N NB memsize(M)\n"); exit(1); } N = atoi(argv[1]); NB = atoi(argv[2]); memsize = (size_t)atoi(argv[3])*1024*1024; BB = (N + NB - 1) / NB; w = memsize/sizeof(double)/BB/NB/NB - 1; assert(w > 0); LDA = N + 0; //padding int do_io = (N <= NSIZE); double llttime; double gflops; nprow = npcol = 1; blacs_pinfo_(&iam, &nprocs); blacs_get_(&i_negone, &i_zero, &ICTXT); blacs_gridinit_(&ICTXT, "R", &nprow, &npcol); blacs_gridinfo_(&ICTXT, &nprow, &npcol, &myprow, &mypcol); #ifdef USE_MIC #ifdef __INTEL_OFFLOAD printf("offload compilation enabled\ninitialize each MIC\n"); offload_init(&iam, &mydevice); #pragma offload target(mic:0) { mkl_peak_mem_usage(MKL_PEAK_MEM_ENABLE); } #else if(isroot) printf("offload compilation not enabled\n"); exit(0); #endif #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasCreate(&worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasInit(); #endif #endif double *test_A = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for chol #ifdef VERIFY double *test_A2 = (double*)memalign(64,(size_t)LDA*N*sizeof(double)); // for verify #endif /*Initialize A */ int i,j; printf("Initialize A ... "); fflush(stdout); llttime = MPI_Wtime(); pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); llttime = MPI_Wtime() - llttime; printf("time %lf\n", llttime); /*print test_A*/ if(do_io){ printf("Original A=\n\n"); matprint(test_A, N, LDA, 'A'); } /*Use directed unblocked Cholesky factorization*/ /* t1 = clock(); Test_dpotrf(test_A2,N); t2 = clock(); printf ("time for unblocked Cholesky factorization on host %f \n", ((float) (t2 - t1)) / CLOCKS_PER_SEC); */ /*print test_A*/ /* if(do_io){ printf("Unblocked result:\n\n"); matprint(test_A2,N,'L'); } */ /*Use tile algorithm*/ Quark *quark = QUARK_New(OOC_NTHREADS); QUARK_DOT_DAG_Enable(quark, 0); #ifdef USE_MIC // mklmem(NB); printf("QUARK MIC affinity binding\n"); QUARK_bind(quark); printf("offload warm up\n"); warmup(quark); #endif QUARK_DOT_DAG_Enable(quark, quark_getenv_int("QUARK_DOT_DAG_ENABLE", 0)); printf("LLT start %lf\n", MPI_Wtime()); llttime = Cholesky(quark,test_A,N,NB,LDA,memsize); printf("LLT end %lf\n", MPI_Wtime()); QUARK_Delete(quark); #ifdef USE_MIC offload_destroy(); #else #ifdef USE_CUBLASV2 { cublasStatus_t cuStatus; for(int r = 0; r < OOC_NTHREADS; r++){ cuStatus = cublasDestroy(worker_handle[r]); assert(cuStatus == CUBLAS_STATUS_SUCCESS); } } #else cublasShutdown(); #endif #endif gflops = (double) N; gflops = gflops/3.0 + 0.5; gflops = gflops*(double)(N)*(double)(N); gflops = gflops/llttime/1024.0/1024.0/1024.0; printf ("N NB memsize(MB) quark_pthreads time Gflops\n%d %d %lf %d %lf %lf\n", N, NB, (double)memsize/1024/1024, OOC_NTHREADS, llttime, gflops); #ifdef USE_MIC #pragma offload target(mic:0) { memsize = mkl_peak_mem_usage(MKL_PEAK_MEM_RESET); } printf("mkl_peak_mem_usage %lf MB\n", (double)memsize/1024.0/1024.0); #endif /*Update and print L*/ if(do_io){ printf("L:\n\n"); matprint(test_A,N,LDA,'L'); } #ifdef VERIFY printf("Verify... "); llttime = MPI_Wtime(); /* * ------------------------ * check difference betwen * test_A and test_A2 * ------------------------ */ /* { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = j; i < N; i++) { double err = (test_A (i, j) - test_A2 (i, j)); err = ABS (err); maxerr = MAX (err, maxerr); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf ("max difference between test_A and test_A2 %lf \n", maxerr); printf ("L2 difference between test_A and test_A2 %lf \n", maxerr2); }; */ /* * ------------------ * over-write test_A2 * ------------------ */ pdmatgen(&ICTXT, "Symm", "Diag", &N, &N, &NB, &NB, test_A2, &LDA, &i_zero, &i_zero, &IASEED, &i_zero, &N, &i_zero, &N, &myprow, &mypcol, &nprow, &npcol); /* * --------------------------------------- * after solve, test_A2 should be identity * --------------------------------------- */ // test_A = chol(B) = L; // test_A2 = B // solve L*L'*X = B // if L is correct, X is identity */ { int uplo = 'L'; const char *uplo_char = ((uplo == (int) 'U') || (uplo == (int) 'u')) ? "U" : "L"; int info = 0; int nrhs = N; int LDA = N; int ldb = N; dpotrs(uplo_char, &N, &nrhs, test_A, &LDA, test_A2, &ldb, &info); assert (info == 0); } { double maxerr = 0; double maxerr2 = 0; for (j = 0; j < N; j++) { for (i = 0; i < N; i++) { double eyeij = (i == j) ? 1.0 : 0.0; double err = (test_A2 (i, j) - eyeij); err = ABS (err); maxerr = MAX (maxerr, err); maxerr2 = maxerr2 + err * err; }; }; maxerr2 = sqrt (ABS (maxerr2)); printf("time %lf\n", MPI_Wtime() - llttime); printf ("max error %lf \n", maxerr); printf ("max L2 error %lf \n", maxerr2); } #endif free(test_A);test_A=NULL; #ifdef VERIFY free(test_A2);test_A2=NULL; #endif blacs_gridexit_(&ICTXT); blacs_exit_(&i_zero); return 0; #undef test_A #undef test_A2 }
~MagmaSpectralSolver() { cublasShutdown(); }
/*@C PetscFinalize - Checks for options to be called at the conclusion of the program. MPI_Finalize() is called only if the user had not called MPI_Init() before calling PetscInitialize(). Collective on PETSC_COMM_WORLD Options Database Keys: + -options_table - Calls PetscOptionsView() . -options_left - Prints unused options that remain in the database . -objects_dump [all] - Prints list of objects allocated by the user that have not been freed, the option all cause all outstanding objects to be listed . -mpidump - Calls PetscMPIDump() . -malloc_dump - Calls PetscMallocDump() . -malloc_info - Prints total memory usage - -malloc_log - Prints summary of memory usage Level: beginner Note: See PetscInitialize() for more general runtime options. .seealso: PetscInitialize(), PetscOptionsView(), PetscMallocDump(), PetscMPIDump(), PetscEnd() @*/ PetscErrorCode PetscFinalize(void) { PetscErrorCode ierr; PetscMPIInt rank; PetscInt nopt; PetscBool flg1 = PETSC_FALSE,flg2 = PETSC_FALSE,flg3 = PETSC_FALSE; #if defined(PETSC_HAVE_AMS) PetscBool flg = PETSC_FALSE; #endif #if defined(PETSC_USE_LOG) char mname[PETSC_MAX_PATH_LEN]; #endif PetscFunctionBegin; if (!PetscInitializeCalled) { printf("PetscInitialize() must be called before PetscFinalize()\n"); PetscFunctionReturn(PETSC_ERR_ARG_WRONGSTATE); } ierr = PetscInfo(NULL,"PetscFinalize() called\n");CHKERRQ(ierr); #if defined(PETSC_SERIALIZE_FUNCTIONS) ierr = PetscFPTDestroy();CHKERRQ(ierr); #endif #if defined(PETSC_HAVE_AMS) ierr = PetscOptionsGetBool(NULL,"-options_gui",&flg,NULL);CHKERRQ(ierr); if (flg) { ierr = PetscOptionsAMSDestroy();CHKERRQ(ierr); } #endif #if defined(PETSC_HAVE_SERVER) flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-server",&flg1,NULL);CHKERRQ(ierr); if (flg1) { /* this is a crude hack, but better than nothing */ ierr = PetscPOpen(PETSC_COMM_WORLD,NULL,"pkill -9 petscwebserver","r",NULL);CHKERRQ(ierr); } #endif ierr = PetscHMPIFinalize();CHKERRQ(ierr); ierr = MPI_Comm_rank(PETSC_COMM_WORLD,&rank);CHKERRQ(ierr); ierr = PetscOptionsGetBool(NULL,"-malloc_info",&flg2,NULL);CHKERRQ(ierr); if (!flg2) { flg2 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-memory_info",&flg2,NULL);CHKERRQ(ierr); } if (flg2) { ierr = PetscMemoryShowUsage(PETSC_VIEWER_STDOUT_WORLD,"Summary of Memory Usage in PETSc\n");CHKERRQ(ierr); } #if defined(PETSC_USE_LOG) flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-get_total_flops",&flg1,NULL);CHKERRQ(ierr); if (flg1) { PetscLogDouble flops = 0; ierr = MPI_Reduce(&petsc_TotalFlops,&flops,1,MPI_DOUBLE,MPI_SUM,0,PETSC_COMM_WORLD);CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"Total flops over all processors %g\n",flops);CHKERRQ(ierr); } #endif #if defined(PETSC_USE_LOG) #if defined(PETSC_HAVE_MPE) mname[0] = 0; ierr = PetscOptionsGetString(NULL,"-log_mpe",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { if (mname[0]) {ierr = PetscLogMPEDump(mname);CHKERRQ(ierr);} else {ierr = PetscLogMPEDump(0);CHKERRQ(ierr);} } #endif mname[0] = 0; ierr = PetscOptionsGetString(NULL,"-log_summary",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { PetscViewer viewer; if (mname[0]) { ierr = PetscViewerASCIIOpen(PETSC_COMM_WORLD,mname,&viewer);CHKERRQ(ierr); ierr = PetscLogView(viewer);CHKERRQ(ierr); ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr); } else { viewer = PETSC_VIEWER_STDOUT_WORLD; ierr = PetscLogView(viewer);CHKERRQ(ierr); } } mname[0] = 0; ierr = PetscOptionsGetString(NULL,"-log_summary_python",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { PetscViewer viewer; if (mname[0]) { ierr = PetscViewerASCIIOpen(PETSC_COMM_WORLD,mname,&viewer);CHKERRQ(ierr); ierr = PetscLogViewPython(viewer);CHKERRQ(ierr); ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr); } else { viewer = PETSC_VIEWER_STDOUT_WORLD; ierr = PetscLogViewPython(viewer);CHKERRQ(ierr); } } ierr = PetscOptionsGetString(NULL,"-log_detailed",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); if (flg1) { if (mname[0]) {ierr = PetscLogPrintDetailed(PETSC_COMM_WORLD,mname);CHKERRQ(ierr);} else {ierr = PetscLogPrintDetailed(PETSC_COMM_WORLD,0);CHKERRQ(ierr);} } mname[0] = 0; ierr = PetscOptionsGetString(NULL,"-log_all",mname,PETSC_MAX_PATH_LEN,&flg1);CHKERRQ(ierr); ierr = PetscOptionsGetString(NULL,"-log",mname,PETSC_MAX_PATH_LEN,&flg2);CHKERRQ(ierr); if (flg1 || flg2) { if (mname[0]) PetscLogDump(mname); else PetscLogDump(0); } #endif /* Free all objects registered with PetscObjectRegisterDestroy() such as PETSC_VIEWER_XXX_(). */ ierr = PetscObjectRegisterDestroyAll();CHKERRQ(ierr); ierr = PetscStackDestroy();CHKERRQ(ierr); flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-no_signal_handler",&flg1,NULL);CHKERRQ(ierr); if (!flg1) { ierr = PetscPopSignalHandler();CHKERRQ(ierr);} flg1 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-mpidump",&flg1,NULL);CHKERRQ(ierr); if (flg1) { ierr = PetscMPIDump(stdout);CHKERRQ(ierr); } flg1 = PETSC_FALSE; flg2 = PETSC_FALSE; /* preemptive call to avoid listing this option in options table as unused */ ierr = PetscOptionsHasName(NULL,"-malloc_dump",&flg1);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-objects_dump",&flg1);CHKERRQ(ierr); ierr = PetscOptionsGetBool(NULL,"-options_table",&flg2,NULL);CHKERRQ(ierr); if (flg2) { PetscViewer viewer; ierr = PetscViewerASCIIGetStdout(PETSC_COMM_WORLD,&viewer);CHKERRQ(ierr); ierr = PetscOptionsView(viewer);CHKERRQ(ierr); ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr); } /* to prevent PETSc -options_left from warning */ ierr = PetscOptionsHasName(NULL,"-nox",&flg1);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-nox_warning",&flg1);CHKERRQ(ierr); if (!PetscHMPIWorker) { /* worker processes skip this because they do not usually process options */ flg3 = PETSC_FALSE; /* default value is required */ ierr = PetscOptionsGetBool(NULL,"-options_left",&flg3,&flg1);CHKERRQ(ierr); ierr = PetscOptionsAllUsed(&nopt);CHKERRQ(ierr); if (flg3) { if (!flg2) { /* have not yet printed the options */ PetscViewer viewer; ierr = PetscViewerASCIIGetStdout(PETSC_COMM_WORLD,&viewer);CHKERRQ(ierr); ierr = PetscOptionsView(viewer);CHKERRQ(ierr); ierr = PetscViewerDestroy(&viewer);CHKERRQ(ierr); } if (!nopt) { ierr = PetscPrintf(PETSC_COMM_WORLD,"There are no unused options.\n");CHKERRQ(ierr); } else if (nopt == 1) { ierr = PetscPrintf(PETSC_COMM_WORLD,"There is one unused database option. It is:\n");CHKERRQ(ierr); } else { ierr = PetscPrintf(PETSC_COMM_WORLD,"There are %D unused database options. They are:\n",nopt);CHKERRQ(ierr); } } #if defined(PETSC_USE_DEBUG) if (nopt && !flg3 && !flg1) { ierr = PetscPrintf(PETSC_COMM_WORLD,"WARNING! There are options you set that were not used!\n");CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"WARNING! could be spelling mistake, etc!\n");CHKERRQ(ierr); ierr = PetscOptionsLeft();CHKERRQ(ierr); } else if (nopt && flg3) { #else if (nopt && flg3) { #endif ierr = PetscOptionsLeft();CHKERRQ(ierr); } } { PetscThreadComm tcomm_world; ierr = PetscGetThreadCommWorld(&tcomm_world);CHKERRQ(ierr); /* Free global thread communicator */ ierr = PetscThreadCommDestroy(&tcomm_world);CHKERRQ(ierr); } /* List all objects the user may have forgot to free */ ierr = PetscOptionsHasName(NULL,"-objects_dump",&flg1);CHKERRQ(ierr); if (flg1) { MPI_Comm local_comm; char string[64]; ierr = PetscOptionsGetString(NULL,"-objects_dump",string,64,NULL);CHKERRQ(ierr); ierr = MPI_Comm_dup(MPI_COMM_WORLD,&local_comm);CHKERRQ(ierr); ierr = PetscSequentialPhaseBegin_Private(local_comm,1);CHKERRQ(ierr); ierr = PetscObjectsDump(stdout,(string[0] == 'a') ? PETSC_TRUE : PETSC_FALSE);CHKERRQ(ierr); ierr = PetscSequentialPhaseEnd_Private(local_comm,1);CHKERRQ(ierr); ierr = MPI_Comm_free(&local_comm);CHKERRQ(ierr); } PetscObjectsCounts = 0; PetscObjectsMaxCounts = 0; ierr = PetscFree(PetscObjects);CHKERRQ(ierr); #if defined(PETSC_USE_LOG) ierr = PetscLogDestroy();CHKERRQ(ierr); #endif /* Destroy any packages that registered a finalize */ ierr = PetscRegisterFinalizeAll();CHKERRQ(ierr); /* Destroy all the function registration lists created */ ierr = PetscFinalize_DynamicLibraries();CHKERRQ(ierr); /* Print PetscFunctionLists that have not been properly freed ierr = PetscFunctionListPrintAll();CHKERRQ(ierr); */ if (petsc_history) { ierr = PetscCloseHistoryFile(&petsc_history);CHKERRQ(ierr); petsc_history = 0; } ierr = PetscInfoAllow(PETSC_FALSE,NULL);CHKERRQ(ierr); { char fname[PETSC_MAX_PATH_LEN]; FILE *fd; int err; fname[0] = 0; ierr = PetscOptionsGetString(NULL,"-malloc_dump",fname,250,&flg1);CHKERRQ(ierr); flg2 = PETSC_FALSE; ierr = PetscOptionsGetBool(NULL,"-malloc_test",&flg2,NULL);CHKERRQ(ierr); #if defined(PETSC_USE_DEBUG) if (PETSC_RUNNING_ON_VALGRIND) flg2 = PETSC_FALSE; #else flg2 = PETSC_FALSE; /* Skip reporting for optimized builds regardless of -malloc_test */ #endif if (flg1 && fname[0]) { char sname[PETSC_MAX_PATH_LEN]; sprintf(sname,"%s_%d",fname,rank); fd = fopen(sname,"w"); if (!fd) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Cannot open log file: %s",sname); ierr = PetscMallocDump(fd);CHKERRQ(ierr); err = fclose(fd); if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fclose() failed on file"); } else if (flg1 || flg2) { MPI_Comm local_comm; ierr = MPI_Comm_dup(MPI_COMM_WORLD,&local_comm);CHKERRQ(ierr); ierr = PetscSequentialPhaseBegin_Private(local_comm,1);CHKERRQ(ierr); ierr = PetscMallocDump(stdout);CHKERRQ(ierr); ierr = PetscSequentialPhaseEnd_Private(local_comm,1);CHKERRQ(ierr); ierr = MPI_Comm_free(&local_comm);CHKERRQ(ierr); } } { char fname[PETSC_MAX_PATH_LEN]; FILE *fd = NULL; fname[0] = 0; ierr = PetscOptionsGetString(NULL,"-malloc_log",fname,250,&flg1);CHKERRQ(ierr); ierr = PetscOptionsHasName(NULL,"-malloc_log_threshold",&flg2);CHKERRQ(ierr); if (flg1 && fname[0]) { int err; if (!rank) { fd = fopen(fname,"w"); if (!fd) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_FILE_OPEN,"Cannot open log file: %s",fname); } ierr = PetscMallocDumpLog(fd);CHKERRQ(ierr); if (fd) { err = fclose(fd); if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fclose() failed on file"); } } else if (flg1 || flg2) { ierr = PetscMallocDumpLog(stdout);CHKERRQ(ierr); } } /* Can be destroyed only after all the options are used */ ierr = PetscOptionsDestroy();CHKERRQ(ierr); PetscGlobalArgc = 0; PetscGlobalArgs = 0; #if defined(PETSC_USE_REAL___FLOAT128) ierr = MPI_Type_free(&MPIU___FLOAT128);CHKERRQ(ierr); #if defined(PETSC_HAVE_COMPLEX) ierr = MPI_Type_free(&MPIU___COMPLEX128);CHKERRQ(ierr); #endif ierr = MPI_Op_free(&MPIU_MAX);CHKERRQ(ierr); ierr = MPI_Op_free(&MPIU_MIN);CHKERRQ(ierr); #endif #if defined(PETSC_HAVE_COMPLEX) #if !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX) ierr = MPI_Type_free(&MPIU_C_DOUBLE_COMPLEX);CHKERRQ(ierr); ierr = MPI_Type_free(&MPIU_C_COMPLEX);CHKERRQ(ierr); #endif #endif #if (defined(PETSC_HAVE_COMPLEX) && !defined(PETSC_HAVE_MPI_C_DOUBLE_COMPLEX)) || defined(PETSC_USE_REAL___FLOAT128) ierr = MPI_Op_free(&MPIU_SUM);CHKERRQ(ierr); #endif ierr = MPI_Type_free(&MPIU_2SCALAR);CHKERRQ(ierr); #if defined(PETSC_USE_64BIT_INDICES) || !defined(MPI_2INT) ierr = MPI_Type_free(&MPIU_2INT);CHKERRQ(ierr); #endif ierr = MPI_Op_free(&PetscMaxSum_Op);CHKERRQ(ierr); ierr = MPI_Op_free(&PetscADMax_Op);CHKERRQ(ierr); ierr = MPI_Op_free(&PetscADMin_Op);CHKERRQ(ierr); /* Destroy any known inner MPI_Comm's and attributes pointing to them Note this will not destroy any new communicators the user has created. If all PETSc objects were not destroyed those left over objects will have hanging references to the MPI_Comms that were freed; but that is ok because those PETSc objects will never be used again */ { PetscCommCounter *counter; PetscMPIInt flg; MPI_Comm icomm; union {MPI_Comm comm; void *ptr;} ucomm; ierr = MPI_Attr_get(PETSC_COMM_SELF,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRQ(ierr); if (flg) { icomm = ucomm.comm; ierr = MPI_Attr_get(icomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr); if (!flg) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_CORRUPT,"Inner MPI_Comm does not have expected tag/name counter, problem with corrupted memory"); ierr = MPI_Attr_delete(PETSC_COMM_SELF,Petsc_InnerComm_keyval);CHKERRQ(ierr); ierr = MPI_Attr_delete(icomm,Petsc_Counter_keyval);CHKERRQ(ierr); ierr = MPI_Comm_free(&icomm);CHKERRQ(ierr); } ierr = MPI_Attr_get(PETSC_COMM_WORLD,Petsc_InnerComm_keyval,&ucomm,&flg);CHKERRQ(ierr); if (flg) { icomm = ucomm.comm; ierr = MPI_Attr_get(icomm,Petsc_Counter_keyval,&counter,&flg);CHKERRQ(ierr); if (!flg) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_CORRUPT,"Inner MPI_Comm does not have expected tag/name counter, problem with corrupted memory"); ierr = MPI_Attr_delete(PETSC_COMM_WORLD,Petsc_InnerComm_keyval);CHKERRQ(ierr); ierr = MPI_Attr_delete(icomm,Petsc_Counter_keyval);CHKERRQ(ierr); ierr = MPI_Comm_free(&icomm);CHKERRQ(ierr); } } ierr = MPI_Keyval_free(&Petsc_Counter_keyval);CHKERRQ(ierr); ierr = MPI_Keyval_free(&Petsc_InnerComm_keyval);CHKERRQ(ierr); ierr = MPI_Keyval_free(&Petsc_OuterComm_keyval);CHKERRQ(ierr); #if defined(PETSC_HAVE_CUDA) { PetscInt p; for (p = 0; p < PetscGlobalSize; ++p) { if (p == PetscGlobalRank) cublasShutdown(); ierr = MPI_Barrier(PETSC_COMM_WORLD);CHKERRQ(ierr); } } #endif if (PetscBeganMPI) { #if defined(PETSC_HAVE_MPI_FINALIZED) PetscMPIInt flag; ierr = MPI_Finalized(&flag);CHKERRQ(ierr); if (flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"MPI_Finalize() has already been called, even though MPI_Init() was called by PetscInitialize()"); #endif ierr = MPI_Finalize();CHKERRQ(ierr); } /* Note: In certain cases PETSC_COMM_WORLD is never MPI_Comm_free()ed because the communicator has some outstanding requests on it. Specifically if the flag PETSC_HAVE_BROKEN_REQUEST_FREE is set (for IBM MPI implementation). See src/vec/utils/vpscat.c. Due to this the memory allocated in PetscCommDuplicate() is never freed as it should be. Thus one may obtain messages of the form [ 1] 8 bytes PetscCommDuplicate() line 645 in src/sys/mpiu.c indicating the memory was not freed. */ ierr = PetscMallocClear();CHKERRQ(ierr); PetscInitializeCalled = PETSC_FALSE; PetscFinalizeCalled = PETSC_TRUE; PetscFunctionReturn(ierr); } #if defined(PETSC_MISSING_LAPACK_lsame_) PETSC_EXTERN int lsame_(char *a,char *b) { if (*a == *b) return 1; if (*a + 32 == *b) return 1; if (*a - 32 == *b) return 1; return 0; } #endif #if defined(PETSC_MISSING_LAPACK_lsame) PETSC_EXTERN int lsame(char *a,char *b) { if (*a == *b) return 1; if (*a + 32 == *b) return 1; if (*a - 32 == *b) return 1; return 0; }
SEXP magmaCholeskyFinal(SEXP A, SEXP n, SEXP NB, SEXP id, SEXP zeroTri, SEXP lowerTri) { magma_init(); // magma_print_devices(); double *h_R; int In, INB, ID; In = INTEGER_VALUE(n); INB = INTEGER_VALUE(NB); ID = INTEGER_VALUE(id); double *PA = NUMERIC_POINTER(A); int i,j; magma_int_t N, n2, lda, status, info, max_size; N=In; lda = N; n2 = lda*N; /* for(i = 0; i < In; i++) { for(j = 0; j < In; j++) { printf("%.8f ", PA[i+j*In]); } printf("\n"); } */ if ( MAGMA_SUCCESS != magma_malloc_pinned( (void**) &h_R, (n2)*sizeof(double) )) { fprintf( stderr, "!!!! magma_malloc_pinned failed for: %s\n", h_R ); magma_finalize(); exit(-1); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, PA, &lda, h_R, &lda ); N = In; status = 0; magma_setdevice(ID); //printf("Modified by Vinay in one GPU\n"); //INB = magma_get_dpotrf_nb(N); // INB = 224; // printf("INB = %d\n", INB); //ngpu = ndevices; // printf("ngpu = %d\n", ngpu); //max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB); // printf("max_size = %d\n", max_size); //int imax_size = max_size; //double *dA; //magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double)); //ldda = (1+N/(INB*ndevices))*INB; // printf("ldda = %d\n", ldda); //magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB); //magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info); int lTri; lTri = INTEGER_VALUE(lowerTri); if(lTri) magma_dpotrf(MagmaLower, N, h_R, N, &info); else magma_dpotrf(MagmaUpper, N, h_R, N, &info); if(info != 0) { printf("magma_dpotrf returned error %d: %s.\n", (int) info, magma_strerror(info)); } lapackf77_dlacpy( MagmaUpperLowerStr, &N, &N, h_R, &lda, PA, &lda ); //magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB); //for(dev = 0; dev < ndevices; dev++) //{ //magma_setdevice(dev); //cudaFree(dA[dev]); //} magma_free_pinned(h_R); magma_finalize(); cublasShutdown(); /* int IZeroTri; IZeroTri = INTEGER_VALUE(zeroTri); if(IZeroTri & lTri) { for(i = 1; i < In; i++) { for(j=0; j< i; j++) { PA[i*In+j] = 0.0; } } } else if(IZeroTri) for(i = 0; i < In; i++) { for(j=i+1; j < In; j++) { PA[i*In+j] = 0.0; } }*/ return(R_NilValue); }
/* ========================================================================== */ int sci_gpuLU(char *fname) { CheckRhs(1,2); CheckLhs(2,2); #ifdef WITH_CUDA cublasStatus status; #endif SciErr sciErr; int* piAddr_A = NULL; double* h_A = NULL; double* hi_A = NULL; int rows_A; int cols_A; int* piAddr_Opt = NULL; double* option = NULL; int rows_Opt; int cols_Opt; void* d_A = NULL; int na; void* pvPtr = NULL; int size_A = sizeof(double); bool bComplex_A = FALSE; int inputType_A; int inputType_Opt; double res; int posOutput = 1; try { sciErr = getVarAddressFromPosition(pvApiCtx, 1, &piAddr_A); if(sciErr.iErr) throw sciErr; if(Rhs == 2) { sciErr = getVarAddressFromPosition(pvApiCtx, 2, &piAddr_Opt); if(sciErr.iErr) throw sciErr; sciErr = getVarType(pvApiCtx, piAddr_Opt, &inputType_Opt); if(sciErr.iErr) throw sciErr; if(inputType_Opt == sci_matrix) { sciErr = getMatrixOfDouble(pvApiCtx, piAddr_Opt, &rows_Opt, &cols_Opt, &option); if(sciErr.iErr) throw sciErr; } else throw "Option syntax is [number,number]."; } else { rows_Opt=1; cols_Opt=2; option = (double*)malloc(2*sizeof(double)); option[0]=0; option[1]=0; } if(rows_Opt != 1 || cols_Opt != 2) throw "Option syntax is [number,number]."; if((int)option[1] == 1 && !isGpuInit()) throw "gpu is not initialised. Please launch gpuInit() before use this function."; sciErr = getVarType(pvApiCtx, piAddr_A, &inputType_A); if(sciErr.iErr) throw sciErr; #ifdef WITH_CUDA if (useCuda()) { if(inputType_A == sci_pointer) { sciErr = getPointer(pvApiCtx, piAddr_A, (void**)&pvPtr); if(sciErr.iErr) throw sciErr; gpuMat_CUDA* gmat; gmat = static_cast<gpuMat_CUDA*>(pvPtr); if(!gmat->useCuda) throw "Please switch to OpenCL mode before use this data."; rows_A=gmat->rows; cols_A=gmat->columns; if(gmat->complex) { bComplex_A = TRUE; size_A = sizeof(cuDoubleComplex); d_A=(cuDoubleComplex*)gmat->ptr->get_ptr(); } else d_A=(double*)gmat->ptr->get_ptr(); // Initialize CUBLAS status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) throw status; na = rows_A * cols_A; } else if(inputType_A == 1) { // Get size and data if(isVarComplex(pvApiCtx, piAddr_A)) { sciErr = getComplexMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A, &hi_A); if(sciErr.iErr) throw sciErr; size_A = sizeof(cuDoubleComplex); bComplex_A = TRUE; } else { sciErr = getMatrixOfDouble(pvApiCtx, piAddr_A, &rows_A, &cols_A, &h_A); if(sciErr.iErr) throw sciErr; } na = rows_A * cols_A; // Initialize CUBLAS status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) throw status; // Allocate device memory status = cublasAlloc(na, size_A, (void**)&d_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; // Initialize the device matrices with the host matrices if(!bComplex_A) { status = cublasSetMatrix(rows_A,cols_A, sizeof(double), h_A, rows_A, (double*)d_A, rows_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; } else writecucomplex(h_A, hi_A, rows_A, cols_A, (cuDoubleComplex *)d_A); } else throw "Bad argument type."; cuDoubleComplex resComplex; // Performs operation if(!bComplex_A) status = decomposeBlockedLU(rows_A, cols_A, rows_A, (double*)d_A, 1); // else // resComplex = cublasZtrsm(na,(cuDoubleComplex*)d_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; // Put the result in scilab switch((int)option[0]) { case 2 : case 1 : sciprint("The first option must be 0 for this function. Considered as 0.\n"); case 0 : // Keep the result on the Host. { // Put the result in scilab if(!bComplex_A) { double* h_res = NULL; sciErr=allocMatrixOfDouble(pvApiCtx, Rhs + posOutput, rows_A, cols_A, &h_res); if(sciErr.iErr) throw sciErr; status = cublasGetMatrix(rows_A,cols_A, sizeof(double), (double*)d_A, rows_A, h_res, rows_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; } else { sciErr = createComplexMatrixOfDouble(pvApiCtx, Rhs + posOutput, 1, 1, &resComplex.x,&resComplex.y); if(sciErr.iErr) throw sciErr; } LhsVar(posOutput)=Rhs+posOutput; posOutput++; break; } default : throw "First option argument must be 0 or 1 or 2."; } switch((int)option[1]) { case 0 : // Don't keep the data input on Device. { if(inputType_A == sci_matrix) { status = cublasFree(d_A); if (status != CUBLAS_STATUS_SUCCESS) throw status; d_A = NULL; } break; } case 1 : // Keep data of the fisrt argument on Device and return the Device pointer. { if(inputType_A == sci_matrix) { gpuMat_CUDA* dptr; gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A}; dptr=new gpuMat_CUDA(tmp); dptr->useCuda = true; dptr->ptr->set_ptr((double*)d_A); if(bComplex_A) dptr->complex=TRUE; else dptr->complex=FALSE; sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr); if(sciErr.iErr) throw sciErr; LhsVar(posOutput)=Rhs+posOutput; } else throw "The first input argument is already a GPU variable."; posOutput++; break; } default : throw "Second option argument must be 0 or 1."; } // Shutdown status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) throw status; } #endif #ifdef WITH_OPENCL if (!useCuda()) { throw "not implemented with OpenCL."; } #endif if(Rhs == 1) { free(option); option = NULL; } if(posOutput < Lhs+1) throw "Too many output arguments."; if(posOutput > Lhs+1) throw "Too few output arguments."; PutLhsVar(); return 0; } catch(const char* str) { Scierror(999,"%s\n",str); } catch(SciErr E) { printError(&E, 0); } #ifdef WITH_CUDA catch(cudaError_t cudaE) { GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE); } catch(cublasStatus CublasE) { GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1); } if (useCuda()) { if(inputType_A == 1 && d_A != NULL) cudaFree(d_A); } #endif #ifdef WITH_OPENCL if (!useCuda()) { Scierror(999,"not implemented with OpenCL.\n"); } #endif if(Rhs == 1 && option != NULL) free(option); return EXIT_FAILURE; }
SEXP smagmaCholeskyFinal_m(SEXP A, SEXP n, SEXP NB, SEXP zeroTri, SEXP ngpu, SEXP lowerTri) { magma_init(); int ndevices; ndevices = INTEGER_VALUE(ngpu); int idevice; for(idevice=0; idevice < ndevices; idevice++) { magma_setdevice(idevice); if(CUBLAS_STATUS_SUCCESS != cublasInit()) { printf("Error: gpu %d: cublasInit failed\n", idevice); magma_finalize(); exit(-1); } } // magma_print_devices(); int In, INB; In = INTEGER_VALUE(n); INB = INTEGER_VALUE(NB); double *PA = NUMERIC_POINTER(A); float *sPA = calloc(In*In, sizeof(float)); int i,j; for(i = 0; i < In; i++) { for(j = 0; j < In; j++) { sPA[i*In + j] = (float) PA[i*In + j]; } } magma_int_t N, status, info, nGPUs; N = In; status = 0; nGPUs = ndevices; //INB = magma_get_dpotrf_nb(N); // INB = 224; // printf("INB = %d\n", INB); //ngpu = ndevices; // printf("ngpu = %d\n", ngpu); //max_size = INB*(1+N/(INB*ndevices))*INB*((N+INB-1)/INB); // printf("max_size = %d\n", max_size); //int imax_size = max_size; //double *dA; //magma_dmalloc_pinned((void**)&dA, In*In*sizeof(double)); //ldda = (1+N/(INB*ndevices))*INB; // printf("ldda = %d\n", ldda); //magma_dsetmatrix_1D_row_bcyclic(N, N, PA, N, dA, ldda, ngpu, INB); //magma_dpotrf_mgpu(ngpu, MagmaLower, N, dA, ldda, &info); int lTri; lTri = INTEGER_VALUE(lowerTri); if(lTri) magma_spotrf_m(nGPUs, MagmaLower, N, sPA, N, &info); else magma_spotrf_m(nGPUs, MagmaUpper, N, sPA, N, &info); if(info != 0) { printf("magma_spotrf returned error %d: %s.\n", (int) info, magma_strerror(info)); } //magma_dgetmatrix_1D_row_bcyclic(N, N, dA, ldda, PA, N, ngpu, INB); //for(dev = 0; dev < ndevices; dev++) //{ //magma_setdevice(dev); //cudaFree(dA[dev]); //} magma_finalize(); cublasShutdown(); //caste sPA back to double and set upper or lower triangle to zero if necessary: int IZeroTri = INTEGER_VALUE(zeroTri); int zeroUTri = IZeroTri & lTri; int zeroLTri = IZeroTri & !lTri; if(!IZeroTri) { for(i = 1; i< In; i++) { for(j=1; j < In; j++) { PA[i*In + j] = (double) sPA[i*In + j]; } } } else if(zeroUTri) { for(i = 1; i< In; i++) { for(j=1; j < In; j++) { if(i > j) PA[i*In + j] = 0; else PA[i*In + j] = (double) sPA[i*In + j]; } } } else { for(i = 1; i< In; i++) { for(j=1; j < In; j++) { if(i < j) PA[i*In + j] = 0; else PA[i*In + j] = (double) sPA[i*In + j]; } } } UNPROTECT(1); free(sPA); return(R_NilValue); }
int main( int argc,char** argv) { printf("hello world\n"); if (!InitCUDA()) { return 0; } int iter = 1000; int trainnum = 20; bool isProfiler = false; int intProfiler = 0; int testnum = -1; float maxtime = 0.0f; cutGetCmdLineArgumenti(argc, (const char**) argv, "train", &trainnum); cutGetCmdLineArgumenti(argc, (const char**) argv, "iter", &iter); cutGetCmdLineArgumenti(argc, (const char**) argv, "profiler", &intProfiler); cutGetCmdLineArgumenti(argc, (const char**) argv, "test", &testnum); cutGetCmdLineArgumentf(argc, (const char**) argv, "maxtime", &maxtime); printf("%d\n", intProfiler); if(intProfiler) { isProfiler = true; } if(testnum == -1) testnum = trainnum /2; printf("Iter = %d\n", iter); printf("TrainNum = %d\n", trainnum); printf("TestNum = %d\n", testnum); CUT_DEVICE_INIT(argc, argv); cublasStatus status; status = cublasInit(); if(status != CUBLAS_STATUS_SUCCESS) { printf("Can't init cublas\n"); printf("%s\n", cudaGetErrorString(cudaGetLastError())); return -1; } Image* imageList = new Image[trainnum+testnum]; read64("my_optdigits.tra", imageList, trainnum + testnum); const int warmUpTime = 3; if(!isProfiler) { freopen("verbose.txt", "w", stdout); for(int i=0;i< warmUpTime;i++) { runImage(argc, argv, imageList, trainnum < warmUpTime ? trainnum : warmUpTime, 0, 10, false, 0.0f); } freopen("CON", "w", stdout); printf("Warm-up complete.\n\n\n"); } #ifdef _DEBUG freopen("out.txt", "w", stdout); #endif // _DEBUG runImage(argc, argv, imageList, trainnum, testnum, iter, true, maxtime); freopen("CON", "w", stdout); delete[] imageList; //TestReduce(); cublasShutdown(); if(!isProfiler) { CUT_EXIT(argc, argv); } //getchar(); return 0; }
inline void ShutdownTensorEngine( void ){ cublasShutdown(); }
/* Main */ int main(int argc, char** argv) { if (argc!=5){ fprintf (stderr, "Usage: %s <sizeM> <sizeN> <sizeK> <Nb iter>\n",argv[0]); exit(0); } const int M=strtol(argv[1],0,0); const int N=strtol(argv[2],0,0); const int K=strtol(argv[3],0,0); const int NBITER=strtol(argv[4],0,0); const int NA= M * K; const int NB= K * N; const int NC= M * N; real* h_A; real* h_B; real* h_C; const real alpha = 1.0f; const real beta = 0.0f; #ifdef NVIDIA cublasStatus status; real* d_A = 0; real* d_B = 0; real* d_C = 0; #endif #ifdef COMPARE real* h_C_ref; real error_norm; real ref_norm; real diff; #endif /* Allocate host memory for the matrices */ h_A = (real*)malloc(NA * sizeof(h_A[0])); if (h_A == 0) { fprintf (stderr, "!!!! host memory allocation error (A)\n"); return EXIT_FAILURE; } h_B = (real*)malloc(NB * sizeof(h_B[0])); if (h_B == 0) { fprintf (stderr, "!!!! host memory allocation error (B)\n"); return EXIT_FAILURE; } h_C = (real*)malloc(NC * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } for (int i = 0; i < NA; ++i) h_A[i] = M_PI+(real)i; for (int i = 0; i < NB; ++i) h_B[i] = M_PI+(real)i; #ifdef NVIDIA /* Initialize CUBLAS */ status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate device memory for the matrices */ status = cublasAlloc(NA, sizeof(d_A[0]), (void**)&d_A); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (A)\n"); return EXIT_FAILURE; } status = cublasAlloc(NB, sizeof(d_B[0]), (void**)&d_B); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (B)\n"); return EXIT_FAILURE; } status = cublasAlloc(NC, sizeof(d_C[0]), (void**)&d_C); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (C)\n"); return EXIT_FAILURE; } /* Initialize the device matrices with the host matrices */ status = cublasSetVector(NA, sizeof(h_A[0]), h_A, 1, d_A, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(NB, sizeof(h_B[0]), h_B, 1, d_B, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write B)\n"); return EXIT_FAILURE; } status = cublasSetVector(NC, sizeof(h_C[0]), h_C, 1, d_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write C)\n"); return EXIT_FAILURE; } /* Clear last error */ cublasGetError(); #endif #ifdef COMPARE /* Performs operation using plain C code */ for (int i=0;i<NBITER;i++) c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Allocate host memory for reading back the result from device memory */ h_C = (real*)malloc(NC * sizeof(h_C[0])); if (h_C == 0) { fprintf (stderr, "!!!! host memory allocation error (C)\n"); return EXIT_FAILURE; } #endif #ifdef NVIDIA /* Performs operation using cublas */ for (int i=0;i<NBITER;i++) //We must Change the order of the parameter as cublas take //matrix as colomn major and C matrix is row major cublasSgemm('n', 'n', N, M, K, alpha, d_B, N, d_A, K, beta, d_C, N); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(NC, sizeof(h_C[0]), d_C, 1, h_C, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read C)\n"); return EXIT_FAILURE; } #elif defined( CXGEMM ) for (int i=0;i<NBITER;i++) c_xgemm(M,N,K, alpha, h_A, h_B, beta, h_C); #else char transa='N', transb='N'; for (int i=0;i<NBITER;i++) sgemm_(&transb, &transa, &N, &M, &K, &alpha, h_B, &N, h_A, &K, &beta, h_C, &N); #endif #ifdef COMPARE /* Check result against reference */ error_norm = 0; ref_norm = 0; for (int i = 0; i < NC; ++i) { diff = h_C_ref[i] - h_C[i]; error_norm += diff * diff; ref_norm += h_C_ref[i] * h_C_ref[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) { fprintf (stderr, "!!!! reference norm is 0\n"); return EXIT_FAILURE; } printf( "Test %s\n", (error_norm / ref_norm < 1e-6f) ? "PASSED" : "FAILED"); #endif /* Memory clean up */ free(h_A); free(h_B); free(h_C); #ifdef NVIDIA status = cublasFree(d_A); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (A)\n"); return EXIT_FAILURE; } status = cublasFree(d_B); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (B)\n"); return EXIT_FAILURE; } status = cublasFree(d_C); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (C)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } #endif // if (argc <= 1 || strcmp(argv[1], "-noprompt")) { // printf("\nPress ENTER to exit...\n"); // getchar(); // } return EXIT_SUCCESS; }
int main(void) { cublasStatus status; float* h_image; float* h_covariance; float* d_image; float* d_covariance; float alpha = 1.0f; float beta = 0.0f; int imgsize = N * L; //int i; FILE *fp1, *fp2; /* Initialize CUBLAS */ status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } /* Allocate host memory for the image */ h_image = (float*)malloc(imgsize * sizeof(float)); if (h_image == 0) { fprintf (stderr, "!!!! host memory allocation error (image)\n"); return EXIT_FAILURE; } h_covariance = (float*)calloc(L * L, sizeof(float)); if (h_covariance == 0) { fprintf (stderr, "!!!! host memory allocation error (covariance)\n"); return EXIT_FAILURE; } /* Fill the image with test data for (i = 0; i < imgsize; i++) { h_image[i] = rand() / (float)RAND_MAX; }*/ fp1 = fopen("image.dat","rb"); fread(h_image, sizeof(float), imgsize, fp1); printf("Valor de image[0]: %f\n", h_image[8]); /* Allocate device memory */ status = cublasAlloc(imgsize, sizeof(float), (void**)&d_image); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (image)\n"); return EXIT_FAILURE; } status = cublasAlloc(L * L, sizeof(float), (void**)&d_covariance); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device memory allocation error (covariance)\n"); return EXIT_FAILURE; } /* Copy image to device memory */ status = cublasSetVector(imgsize, sizeof(float), h_image, 1, d_image, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write A)\n"); return EXIT_FAILURE; } status = cublasSetVector(L * L, sizeof(float), h_covariance, 1, d_covariance, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (write covariance)\n"); return EXIT_FAILURE; } /* Clear last error */ cublasGetError(); /* Calculate covariance matrix using cublas */ cublasSgemm('n', 't', L, L, N, alpha, d_image, L, d_image, L, beta, d_covariance, L); status = cublasGetError(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } /* Read the result back */ status = cublasGetVector(L * L, sizeof(float), d_covariance, 1, h_covariance, 1); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! device access error (read covariance)\n"); return EXIT_FAILURE; } fp2 = fopen("covariance.dat","wb"); fwrite(h_covariance, sizeof(float), L*L, fp2); printf("Valor de covariance[8]: %f\n", h_covariance[8]); /* Memory clean up */ free(h_image); free(h_covariance); status = cublasFree(d_image); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (image)\n"); return EXIT_FAILURE; } status = cublasFree(d_covariance); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! memory free error (covariance)\n"); return EXIT_FAILURE; } /* Shutdown */ status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error (A)\n"); return EXIT_FAILURE; } fclose(fp1); fclose(fp2); printf("\nPress ENTER to exit...\n"); getchar(); return EXIT_SUCCESS; }