void cuda_initialize() {
    CUDA_CHECK(cudaStreamCreate(&g_context.stream));
    CUBLAS_CHECK(cublasCreate_v2(&g_context.cublas_handle));
    CUBLAS_CHECK(cublasSetStream(g_context.cublas_handle, g_context.stream));
    // CUDNN_CHECK(cudnnCreate(&g_context.cudnn_handle));
    // CUDNN_CHECK(cudnnSetStream(g_context.cudnn_handle, g_context.stream));
}
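// No matching teardown appears in the original; the sketch below is an
// assumption (the cuda_finalize name is ours), releasing the g_context
// resources created above in reverse order of creation.
void cuda_finalize() {
    CUBLAS_CHECK(cublasDestroy_v2(g_context.cublas_handle));
    CUDA_CHECK(cudaStreamDestroy(g_context.stream));
}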
void init_GPU_data(GlobalParams &p) {
    int n_known = p.nRegions;
    int n_total = p.pred_dist.n_rows;
    int n_pred  = n_total - n_known;

    scatr_magma_init();

    arma::mat dist12 = p.pred_dist(arma::span(0, n_known-1),       arma::span(n_known, n_total-1));
    arma::mat dist22 = p.pred_dist(arma::span(n_known, n_total-1), arma::span(n_known, n_total-1));

    checkCublasError( cublasCreate_v2(&p.handle), "handle (Create)" );

    cudaMalloc((void**) &p.d_dist12, n_known * n_pred * sizeof(double));
    checkCudaError("dist12 (Malloc)");
    checkCublasError( cublasSetMatrix(n_known, n_pred, sizeof(double),
                                      dist12.memptr(), n_known, p.d_dist12, n_known),
                      "dist12 (Set)" );

    cudaMalloc((void**) &p.d_dist22, n_pred * n_pred * sizeof(double));
    checkCudaError("dist22 (Malloc)");
    checkCublasError( cublasSetMatrix(n_pred, n_pred, sizeof(double),
                                      dist22.memptr(), n_pred, p.d_dist22, n_pred),
                      "dist22 (Set)" );

    cudaMalloc((void**) &p.d_cov12, n_known * n_pred * sizeof(double));
    checkCudaError("cov12 (Malloc)");

    cudaMalloc((void**) &p.d_cov22, n_pred * n_pred * sizeof(double));
    checkCudaError("cov22 (Malloc)");

    cudaMalloc((void**) &p.d_invcov11, n_known * n_known * sizeof(double));
    checkCudaError("invcov11 (Malloc)");

    cudaMalloc((void**) &p.d_tmp, n_pred * n_known * sizeof(double));
    checkCudaError("tmp (Malloc)");
}
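// The original does not show the matching cleanup; this sketch assumes the
// same GlobalParams members as above (the free_GPU_data name is ours) and
// releases them in reverse order of allocation.
void free_GPU_data(GlobalParams &p) {
    cudaFree(p.d_tmp);       checkCudaError("tmp (Free)");
    cudaFree(p.d_invcov11);  checkCudaError("invcov11 (Free)");
    cudaFree(p.d_cov22);     checkCudaError("cov22 (Free)");
    cudaFree(p.d_cov12);     checkCudaError("cov12 (Free)");
    cudaFree(p.d_dist22);    checkCudaError("dist22 (Free)");
    cudaFree(p.d_dist12);    checkCudaError("dist12 (Free)");
    checkCublasError( cublasDestroy_v2(p.handle), "handle (Destroy)" );
}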
Dragon::Dragon()
    : mode(Dragon::CPU), solver_count(1), root_solver(true),
      cublas_handle(NULL), curand_generator(NULL) {
    if (cublasCreate_v2(&cublas_handle) != CUBLAS_STATUS_SUCCESS)
        LOG(ERROR) << "Couldn't create cublas handle.";
    if (curandCreateGenerator(&curand_generator, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS ||
        curandSetPseudoRandomGeneratorSeed(curand_generator, cluster_seedgen()) != CURAND_STATUS_SUCCESS)
        LOG(ERROR) << "Couldn't create curand generator.";
}
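// The excerpt does not include the matching destructor; a minimal sketch,
// assuming the same members, releases both handles behind NULL guards:
Dragon::~Dragon() {
    if (cublas_handle) cublasDestroy_v2(cublas_handle);
    if (curand_generator) curandDestroyGenerator(curand_generator);
}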
int main() {
    time_t start = time(NULL);
    int dim = L * (nmax + 1);
    const real epsg = EPSG;
    const real epsf = EPSF;
    const real epsx = EPSX;
    const int maxits = MAXITS;
    stpscal = 0.5;
    int info;

    // cudaDeviceMapHost must be set before the CUDA context is created by
    // the first allocation, or the mapped host allocation below will fail.
    cudaSetDeviceFlags(cudaDeviceMapHost);

    real* x;
    int* nbd;
    real* l;
    real* u;
    memAlloc<real>(&x, dim);
    memAlloc<int>(&nbd, dim);
    memAlloc<real>(&l, dim);
    memAlloc<real>(&u, dim);
    memAllocHost<real>(&f_tb_host, &f_tb_dev, 1);

    cublasCreate_v2(&cublasHd);

    U = 1;
    J = 0.1;
    mu = 0.5;
    initProb(x, nbd, l, u, dim);
    lbfgsbminimize(dim, 4, x, epsg, epsf, epsx, maxits, nbd, l, u, info);
    printf("info: %d\n", info);
    printf("f: %e\n", *f_tb_host);

    real* x_host = new real[dim];
    memCopy(x_host, x, dim * sizeof(real), cudaMemcpyDeviceToHost);
    printf("x: ");
    for (int i = 0; i < dim; i++) {
        printf("%f, ", x_host[i]);
    }
    printf("\n");
    delete[] x_host;

    memFreeHost(f_tb_host);
    memFree(x);
    memFree(nbd);
    memFree(l);
    memFree(u);
    cublasDestroy_v2(cublasHd);
    cudaDeviceReset();

    time_t end = time(NULL);
    printf("Runtime: %ld\n", end - start);
    return 0;
}
void Dragon::set_device(const int device_id) {
    int current_device;
    CUDA_CHECK(cudaGetDevice(&current_device));
    if (current_device == device_id) return;
    // The call to cudaSetDevice must come before any calls to Get, which
    // may perform initialization using the GPU.
    // Switching devices invalidates the existing handle and generator,
    // so both are destroyed and recreated on the new device.
    CUDA_CHECK(cudaSetDevice(device_id));
    if (Get().cublas_handle) cublasDestroy_v2(Get().cublas_handle);
    if (Get().curand_generator) curandDestroyGenerator(Get().curand_generator);
    cublasCreate_v2(&Get().cublas_handle);
    curandCreateGenerator(&Get().curand_generator, CURAND_RNG_PSEUDO_DEFAULT);
    curandSetPseudoRandomGeneratorSeed(Get().curand_generator, cluster_seedgen());
}
value_type square_residual( value_type* ug, value_type thickness ) {
    int current_id;
    cuda_assert( cudaGetDevice(&current_id) );
    if ( current_id != config.device_id )
        cuda_assert( cudaSetDevice( config.device_id ) );

    update_I_diff(ug, thickness);

    // Note: a cuBLAS handle is created and destroyed on every call; the dot
    // product computes the squared norm of I_diff.
    value_type residual;
    cublasHandle_t handle;
    cublas_assert( cublasCreate_v2(&handle) );
    cublas_assert( cublasDdot_v2( handle, static_cast<int>(config.max_dim*config.tilt_size),
                                  data.I_diff, 1, data.I_diff, 1, &residual ) );
    cublas_assert( cublasDestroy_v2(handle) );
    return residual;
}
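// Creating and destroying a cuBLAS handle on every call adds setup cost. A
// minimal alternative sketch, assuming a long-lived handle stored as a
// hypothetical member data.cublas_handle (not part of the original):
value_type square_residual_cached( value_type* ug, value_type thickness ) {
    // device selection as in square_residual, omitted here for brevity
    update_I_diff(ug, thickness);
    value_type residual;
    cublas_assert( cublasDdot_v2( data.cublas_handle,
                                  static_cast<int>(config.max_dim*config.tilt_size),
                                  data.I_diff, 1, data.I_diff, 1, &residual ) );
    return residual;
}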
extern "C" magma_int_t
magma_zgeqrf_batched(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex **dA_array, magma_int_t ldda,
    magmaDoubleComplex **tau_array,
    magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue)
{
#define dA(i, j)  (dA + (i) + (j)*ldda)   // A(i, j) means the entry at row i, column j

    magma_int_t min_mn = min(m, n);
    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));

    /* Check arguments */
    magma_int_t arginfo = 0;
    if (m < 0)
        arginfo = -1;
    else if (n < 0)
        arginfo = -2;
    else if (ldda < max(1,m))
        arginfo = -4;

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    /* Quick return if possible */
    if (min_mn == 0)
        return arginfo;

    if ( m > 2048 || n > 2048 ) {
        printf("=========================================================================================\n");
        printf(" WARNING: batched routines are designed for small sizes; it might be better to use the\n"
               " Native/Hybrid classical routines if you want performance\n");
        printf("=========================================================================================\n");
    }

    magma_int_t nb = 32;
    magma_int_t nnb = 8;
    magma_int_t i, k, ib = nb, jb = nnb;
    magma_int_t ldw, ldt, ldr, offset;

    cublasHandle_t myhandle;
    cublasCreate_v2(&myhandle);

    magmaDoubleComplex **dW0_displ = NULL;
    magmaDoubleComplex **dW1_displ = NULL;
    magmaDoubleComplex **dW2_displ = NULL;
    magmaDoubleComplex **dW3_displ = NULL;
    magmaDoubleComplex **dW4_displ = NULL;
    magmaDoubleComplex **dW5_displ = NULL;

    magmaDoubleComplex *dwork = NULL;
    magmaDoubleComplex *dT = NULL;
    magmaDoubleComplex *dR = NULL;
    magmaDoubleComplex **dR_array  = NULL;
    magmaDoubleComplex **dT_array  = NULL;
    magmaDoubleComplex **cpuAarray = NULL;
    magmaDoubleComplex **cpuTarray = NULL;

    magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ));
    magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ));  // used in zlarfb
    magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ));
    magma_malloc((void**)&dR_array,  batchCount * sizeof(*dR_array));
    magma_malloc((void**)&dT_array,  batchCount * sizeof(*dT_array));

    ldt = ldr = min(nb, min_mn);
    magma_zmalloc(&dwork, (2 * nb * n) * batchCount);
    magma_zmalloc(&dR, ldr * n * batchCount);
    magma_zmalloc(&dT, ldt * ldt * batchCount);
    magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaDoubleComplex*));
    magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(magmaDoubleComplex*));

    /* check allocation */
    if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL ||
         dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL ||
         dR_array == NULL || dT_array == NULL || dR == NULL || dT == NULL ||
         dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) {
        magma_free(dW0_displ);
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dW5_displ);
        magma_free(dR_array);
        magma_free(dT_array);
        magma_free(dR);
        magma_free(dT);
        magma_free(dwork);
        magma_free_cpu(cpuAarray);
        magma_free_cpu(cpuTarray);
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magmablas_zlaset_q(MagmaFull, ldr, n*batchCount,   MAGMA_Z_ZERO, MAGMA_Z_ZERO, dR, ldr, queue);
    magmablas_zlaset_q(MagmaFull, ldt, ldt*batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dT, ldt, queue);
    zset_pointer(dR_array, dR, 1, 0, 0, ldr*min(nb, min_mn), batchCount, queue);
    zset_pointer(dT_array, dT, 1, 0, 0, ldt*min(nb, min_mn), batchCount, queue);

    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);
    magma_int_t streamid;
    const magma_int_t nbstreams = 32;
    magma_queue_t stream[nbstreams];
    for (i = 0; i < nbstreams; i++) {
        magma_queue_create( &stream[i] );
    }
    magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dA_array, 1, cpuAarray, 1);
    magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dT_array, 1, cpuTarray, 1);

    magmablasSetKernelStream(NULL);

    for (i = 0; i < min_mn; i += nb) {
        ib = min(nb, min_mn-i);

        //===============================================
        // panel factorization
        //===============================================
        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue);
        magma_zdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue);

        // dwork is used in panel factorization and trailing matrix update;
        // dW4_displ, dW5_displ are used as workspace and configured inside
        magma_zgeqrf_panel_batched(m-i, ib, jb,
                                   dW0_displ, ldda,
                                   dW2_displ,
                                   dT_array, ldt,
                                   dR_array, ldr,
                                   dW1_displ,
                                   dW3_displ,
                                   dwork,
                                   dW4_displ, dW5_displ,
                                   info_array,
                                   batchCount, myhandle, queue);

        //===============================================
        // end of panel
        //===============================================

        // direct panel matrix V in dW0_displ
        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue);
        // copy the upper part of V into dR
        zgeqrf_copy_upper_batched(ib, jb, dW0_displ, ldda, dR_array, ldr, batchCount, queue);

        //===============================================
        // update trailing matrix
        //===============================================
        // dwork is used in panel factorization and trailing matrix update;
        // reset dW4_displ
        ldw = nb;
        zset_pointer(dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue );
        offset = ldw*n*batchCount;
        zset_pointer(dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue );

        if ( (n-ib-i) > 0) {
            // set the diagonal of V to one and its upper triangular part to zero
            magmablas_zlaset_batched(MagmaUpper, ib, ib, MAGMA_Z_ZERO, MAGMA_Z_ONE,
                                     dW0_displ, ldda, batchCount, queue);
            magma_zdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue);

            // faster, since it uses BLAS-3 GEMM routines, unlike the LAPACK implementation
            magma_zlarft_batched(m-i, ib, 0,
                                 dW0_displ, ldda,
                                 dW2_displ,
                                 dT_array, ldt,
                                 dW4_displ, nb*ldt,
                                 batchCount, myhandle, queue);

            // perform C = (I - V T^H V^H) * C, where C is the trailing matrix
            //-------------------------------------------
            //          USE STREAM GEMM
            //-------------------------------------------
            if ( (m-i) > 100 && (n-i-ib) > 100) {
                // Since this uses different streams, we would need to wait for the
                // panel; but the rest of the code runs on the NULL stream, which
                // synchronizes by itself, so no explicit sync is needed.
                //magma_device_sync();
                for (k = 0; k < batchCount; k++) {
                    streamid = k % nbstreams;
                    magmablasSetKernelStream(stream[streamid]);

                    // the streamed gemm must take CPU pointers
                    magma_zlarfb_gpu_gemm(MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                          m-i, n-i-ib, ib,
                                          cpuAarray[k] + i + i * ldda, ldda,
                                          cpuTarray[k], ldt,
                                          cpuAarray[k] + i + (i+ib) * ldda, ldda,
                                          dwork + nb * n * k, -1,
                                          dwork + nb * n * batchCount + nb * n * k, -1);
                }
                // We would synchronize here so the next panel does not start before
                // this update finishes, but the NULL stream used by the rest of the
                // code already provides that synchronization.
                //magma_device_sync();
                magmablasSetKernelStream(NULL);
            }
            //-------------------------------------------
            //          USE BATCHED GEMM
            //-------------------------------------------
            else {
                // direct trailing matrix in dW1_displ
                magma_zdisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue);
                magma_zlarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                           m-i, n-i-ib, ib,
                                           (const magmaDoubleComplex**)dW0_displ, ldda,
                                           (const magmaDoubleComplex**)dT_array, ldt,
                                           dW1_displ, ldda,
                                           dW4_displ, ldw,
                                           dW5_displ, ldw,
                                           batchCount, myhandle, queue);
            }
        }  // update the trailing matrix
        //===============================================

        // copy dR back to V after the trailing matrix update
        magmablas_zlacpy_batched(MagmaUpper, ib, ib, dR_array, ldr, dW0_displ, ldda, batchCount, queue);
    }

    for (k = 0; k < nbstreams; k++) {
        magma_queue_destroy( stream[k] );
    }
    magmablasSetKernelStream(cstream);
    cublasDestroy_v2(myhandle);

    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dW5_displ);
    magma_free(dR_array);
    magma_free(dT_array);
    magma_free(dR);
    magma_free(dT);
    magma_free(dwork);
    magma_free_cpu(cpuAarray);
    magma_free_cpu(cpuTarray);

    return arginfo;
}
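// Illustrative driver for magma_zgeqrf_batched above -- a sketch, not part of
// the original source. It assumes the MAGMA 1.x-style helpers used elsewhere
// in this file (magma_zmalloc, zset_pointer, magma_queue_create) and factors
// batchCount independent m-by-n matrices stored contiguously on the GPU.
void run_zgeqrf_batched_example(magma_int_t m, magma_int_t n, magma_int_t batchCount)
{
    magma_int_t ldda = m;            // no row padding in this sketch
    magma_int_t min_mn = min(m, n);

    magma_queue_t queue;
    magma_queue_create( &queue );

    magmaDoubleComplex *dA = NULL, *dtau = NULL;
    magmaDoubleComplex **dA_array = NULL, **dtau_array = NULL;
    magma_int_t *dinfo = NULL;
    magma_zmalloc(&dA,   ldda * n * batchCount);
    magma_zmalloc(&dtau, min_mn   * batchCount);
    magma_malloc((void**)&dA_array,   batchCount * sizeof(*dA_array));
    magma_malloc((void**)&dtau_array, batchCount * sizeof(*dtau_array));
    magma_malloc((void**)&dinfo,      batchCount * sizeof(magma_int_t));

    // ... fill dA with the batch of matrices (e.g. via magma_zsetmatrix) ...

    // Build the pointer arrays: matrix k starts at dA + k*ldda*n,
    // and its tau vector at dtau + k*min_mn.
    zset_pointer(dA_array,   dA,   ldda, 0, 0, ldda*n, batchCount, queue);
    zset_pointer(dtau_array, dtau, 1,    0, 0, min_mn, batchCount, queue);

    magma_int_t info = magma_zgeqrf_batched(m, n, dA_array, ldda, dtau_array,
                                            dinfo, batchCount, queue);
    // info < 0 reports an argument error; per-matrix status sits in the
    // device array dinfo and can be read back with magma_getvector.

    magma_free(dA_array); magma_free(dtau_array); magma_free(dinfo);
    magma_free(dA); magma_free(dtau);
    magma_queue_destroy(queue);
}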
void InitializeGlut(int *argc, char *argv[])
{
    int i, j;

    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
    glutInitWindowSize(screenwidth, screenheight);
    glutCreateWindow(argv[0]);
    glutDisplayFunc(Display);
    glutKeyboardFunc(Keyboard);

    // Support mapped pinned allocations
    cudaSetDeviceFlags(cudaDeviceMapHost);
    cudaGLSetGLDevice(0);
    cublasCreate_v2(&cublasHd);

    glewInit();
    GLint max_texture_size;
    glGetIntegerv(GL_MAX_TEXTURE_SIZE, &max_texture_size);
    if (max_texture_size < screenwidth || max_texture_size < screenheight) {
        printf("Max texture size (%d) is less than screen size (%d, %d)\n",
               max_texture_size, screenwidth, screenheight);
        exit(0);
    }

    // Create the textures
    glActiveTextureARB(GL_TEXTURE0_ARB);

    // Two processing textures (Q: why two?)
    glGenTextures(2, Processed_Texture);
    glBindTexture(GL_TEXTURE_RECTANGLE_NV, Processed_Texture[0]);
    glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2,
                 0, GL_RGBA, GL_FLOAT, NULL);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

    glBindTexture(GL_TEXTURE_RECTANGLE_NV, Processed_Texture[1]);
    glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2,
                 0, GL_RGBA, GL_FLOAT, NULL);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

    // Site texture (Q: why is this separate from the processing textures?)
    glGenTextures(1, &Site_Texture);
    glBindTexture(GL_TEXTURE_RECTANGLE_NV, Site_Texture);
    glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2,
                 0, GL_RGBA, GL_FLOAT, NULL);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

    // Register the texture object specified by image for access by CUDA;
    // a handle to the registered object is returned as the resource.
    cutilSafeCall(cudaGraphicsGLRegisterImage(&grSite, Site_Texture,
                                              GL_TEXTURE_RECTANGLE_NV,
                                              cudaGraphicsMapFlagsReadOnly));

    // Energy textures: two of them, like the processing textures (Q: why?)
    glGenTextures(2, Energy_Texture);
    glBindTexture(GL_TEXTURE_RECTANGLE_NV, Energy_Texture[0]);
    glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2,
                 0, GL_RGBA, GL_FLOAT, NULL);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

    glBindTexture(GL_TEXTURE_RECTANGLE_NV, Energy_Texture[1]);
    glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA32F_ARB, screenwidth+2, screenheight+2,
                 0, GL_RGBA, GL_FLOAT, NULL);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

    // Index texture: indices encoded as colors
    glGenTextures(1, &IndexColor_Texture);
    glBindTexture(GL_TEXTURE_RECTANGLE_NV, IndexColor_Texture);
    glTexImage2D(GL_TEXTURE_RECTANGLE_NV, 0, GL_RGBA, screenwidth, screenheight,
                 0, GL_RGBA, GL_FLOAT, NULL);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_S, GL_CLAMP);
    glTexParameteri(GL_TEXTURE_RECTANGLE_NV, GL_TEXTURE_WRAP_T, GL_CLAMP);

    // Render Buffer Object
    glGenRenderbuffersEXT(1, &RB_object);
    glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, RB_object);
    glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA32F_ARB, screenwidth+2, screenheight+2);

    // Frame Buffer Object
    glGenFramebuffersEXT(1, &FB_objects);

    // Occlusion query support; NVIDIA-specific, see
    // http://developer.download.nvidia.com/opengl/specs/nvOpenGLspecs.pdf
    glGetQueryiv(GL_SAMPLES_PASSED_ARB, GL_QUERY_COUNTER_BITS_ARB, &oq_bitsSupported);
    glGenQueriesARB(1, &occlusion_query);

    InitCg();

    // Precompiled display list of screen pixel positions
    ScreenPointsList = glGenLists(1);
    glNewList(ScreenPointsList, GL_COMPILE);
    glBegin(GL_POINTS);
    for (i = 0; i < screenwidth; i++)
        for (j = 0; j < screenheight; j++)
            glVertex2f(i+1.5, j+1.5);
    glEnd();
    glEndList();
}
/**
    Purpose
    -------
    ZPOTRF computes the Cholesky factorization of a complex Hermitian
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,  if UPLO = MagmaUpper, or
        dA = L * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    dA      COMPLEX_16 array on the GPU, dimension (LDDA,N)
            On entry, the Hermitian matrix dA.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of dA contains the upper
            triangular part of the matrix dA, and the strictly lower
            triangular part of dA is not referenced.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of dA contains the
            lower triangular part of the matrix dA, and the strictly upper
            triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalesced memory accesses, LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

    @ingroup magma_zposv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_zpotrf_batched(
    magma_uplo_t uplo, magma_int_t n,
    magmaDoubleComplex **dA_array, magma_int_t ldda,
    magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue)
{
#define A(i_, j_)  (A + (i_) + (j_)*ldda)

    double d_alpha = -1.0;
    double d_beta  =  1.0;

    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));

    magma_int_t arginfo = 0;
    if ( uplo != MagmaUpper && uplo != MagmaLower) {
        arginfo = -1;
    } else if (n < 0) {
        arginfo = -2;
    } else if (ldda < max(1,n)) {
        arginfo = -4;
    }

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    // Quick return if possible
    if (n == 0) {
        return arginfo;
    }

    if ( n > 2048 ) {
        printf("=========================================================================================\n");
        printf(" WARNING: batched routines are designed for small sizes; it might be better to use the\n"
               " Native/Hybrid classical routines if you want performance\n");
        printf("=========================================================================================\n");
    }

    magma_int_t j, k, ib;
    magma_int_t nb = POTRF_NB;
    magma_int_t gemm_crossover = 127;  // nb > 32 ? 127 : 160;

#if defined(USE_CUOPT)
    cublasHandle_t myhandle;
    cublasCreate_v2(&myhandle);
#else
    cublasHandle_t myhandle = NULL;
#endif

    magmaDoubleComplex **dA_displ    = NULL;
    magmaDoubleComplex **dW0_displ   = NULL;
    magmaDoubleComplex **dW1_displ   = NULL;
    magmaDoubleComplex **dW2_displ   = NULL;
    magmaDoubleComplex **dW3_displ   = NULL;
    magmaDoubleComplex **dW4_displ   = NULL;
    magmaDoubleComplex **dinvA_array = NULL;
    magmaDoubleComplex **dwork_array = NULL;

    magma_malloc((void**)&dA_displ,    batchCount * sizeof(*dA_displ));
    magma_malloc((void**)&dW0_displ,   batchCount * sizeof(*dW0_displ));
    magma_malloc((void**)&dW1_displ,   batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,   batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,   batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ,   batchCount * sizeof(*dW4_displ));
    magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array));
    magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array));

    magma_int_t invA_msize  = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB;
    magma_int_t dwork_msize = n*nb;
    magmaDoubleComplex* dinvA = NULL;
    magmaDoubleComplex* dwork = NULL;  // dinvA and dwork are workspace in ztrsm
    magmaDoubleComplex **cpuAarray = NULL;
    magma_zmalloc( &dinvA, invA_msize * batchCount);
    magma_zmalloc( &dwork, dwork_msize * batchCount );
    magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaDoubleComplex*));

    /* check allocation */
    if ( dA_displ == NULL || dW0_displ == NULL || dW1_displ == NULL ||
         dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL ||
         dinvA_array == NULL || dwork_array == NULL ||
         dinvA == NULL || dwork == NULL || cpuAarray == NULL ) {
        magma_free(dA_displ);
        magma_free(dW0_displ);
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dinvA_array);
        magma_free(dwork_array);
        magma_free( dinvA );
        magma_free( dwork );
        magma_free_cpu(cpuAarray);
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magmablas_zlaset_q(MagmaFull, invA_msize,  batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dinvA, invA_msize,  queue);
    magmablas_zlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dwork, dwork_msize, queue);
    zset_pointer(dwork_array, dwork, 1,      0, 0, dwork_msize, batchCount, queue);
    zset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize,  batchCount, queue);

    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);
    magma_int_t streamid;
    const magma_int_t nbstreams = 32;
    magma_queue_t stream[nbstreams];
    for (k = 0; k < nbstreams; k++) {
        magma_queue_create( &stream[k] );
    }
    magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dA_array, 1, cpuAarray, 1);

    magmablasSetKernelStream(NULL);

    if (uplo == MagmaUpper) {
        printf("Upper side is unavailable\n");
        goto fin;
    }
    else {
        for (j = 0; j < n; j += nb) {
            ib = min(nb, n-j);
#if 1
            //===============================================
            // panel factorization
            //===============================================
            magma_zdisplace_pointers(dA_displ, dA_array, ldda, j, j, batchCount, queue);
            zset_pointer(dwork_array, dwork, 1,      0, 0, dwork_msize, batchCount, queue);
            zset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize,  batchCount, queue);
#if 0
            arginfo = magma_zpotrf_panel_batched(
                          uplo, n-j, ib,
                          dA_displ, ldda,
                          dwork_array, dwork_msize,
                          dinvA_array, invA_msize,
                          dW0_displ, dW1_displ, dW2_displ,
                          dW3_displ, dW4_displ,
                          info_array, j, batchCount, myhandle);
#else
            //arginfo = magma_zpotrf_rectile_batched(
            arginfo = magma_zpotrf_recpanel_batched(
                          uplo, n-j, ib, 32,
                          dA_displ, ldda,
                          dwork_array, dwork_msize,
                          dinvA_array, invA_msize,
                          dW0_displ, dW1_displ, dW2_displ,
                          dW3_displ, dW4_displ,
                          info_array, j, batchCount, myhandle, queue);
#endif
            if (arginfo != 0) goto fin;
            //===============================================
            // end of panel
            //===============================================
#endif
#if 1
            //real_Double_t gpu_time;
            //gpu_time = magma_sync_wtime(NULL);
            if ( (n-j-ib) > 0) {
                if ( (n-j-ib) > gemm_crossover) {
                    //-------------------------------------------
                    //          USE STREAM HERK
                    //-------------------------------------------
                    // Since this uses different streams, we would need to wait for
                    // the panel; but the rest of the code runs on the NULL stream,
                    // which synchronizes by itself, so no explicit sync is needed.
                    //magma_queue_sync(NULL);
                    // The matrix layout must be known in order to do this.
                    for (k = 0; k < batchCount; k++) {
                        streamid = k % nbstreams;
                        magmablasSetKernelStream(stream[streamid]);
                        // the streamed HERK must take CPU pointers
                        magma_zherk(MagmaLower, MagmaNoTrans, n-j-ib, ib,
                                    d_alpha,
                                    (const magmaDoubleComplex*) cpuAarray[k] + j+ib+j*ldda, ldda,
                                    d_beta,
                                    cpuAarray[k] + j+ib+(j+ib)*ldda, ldda);
                    }
                    // We would synchronize here so the next panel does not start
                    // before this update finishes, but the NULL stream used by the
                    // rest of the code already provides that synchronization.
                    //magma_device_sync();
                    magmablasSetKernelStream(NULL);
                }
                else {
                    //-------------------------------------------
                    // USE BATCHED GEMM (in fact a HERK, since it only
                    // accesses the lower part)
                    //-------------------------------------------
                    magma_zdisplace_pointers(dA_displ,  dA_array, ldda, j+ib, j,    batchCount, queue);
                    magma_zdisplace_pointers(dW1_displ, dA_array, ldda, j+ib, j+ib, batchCount, queue);
                    magmablas_zherk_batched(uplo, MagmaNoTrans, n-j-ib, ib,
                                            d_alpha, dA_displ,  ldda,
                                            d_beta,  dW1_displ, ldda,
                                            batchCount, queue);
                }
            }
            //gpu_time = magma_sync_wtime(NULL) - gpu_time;
            //real_Double_t flops = (n-j-ib) * (n-j-ib) * ib / 1e9 * batchCount;
            //real_Double_t gpu_perf = flops / gpu_time;
            //printf("Rows= %d, Column=%d, herk time = %7.2fms, Gflops= %7.2f\n", n-j-ib, ib, gpu_time*1000, gpu_perf);
#endif
        }
    }

fin:
    magma_queue_sync(NULL);
    for (k = 0; k < nbstreams; k++) {
        magma_queue_destroy( stream[k] );
    }
    magmablasSetKernelStream(cstream);
#if defined(USE_CUOPT)
    cublasDestroy_v2(myhandle);
#endif
    magma_free(dA_displ);
    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dinvA_array);
    magma_free(dwork_array);
    magma_free( dinvA );
    magma_free( dwork );
    magma_free_cpu(cpuAarray);
    return arginfo;
}
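// Illustrative driver for magma_zpotrf_batched -- a sketch, not part of the
// original source, assuming the same MAGMA 1.x-style helpers used above.
// Each of the batchCount matrices must be Hermitian positive definite.
void run_zpotrf_batched_example(magma_int_t n, magma_int_t batchCount)
{
    magma_int_t ldda = ((n + 15)/16)*16;   // keep LDDA divisible by 16, per the docs
    magma_queue_t queue;
    magma_queue_create( &queue );

    magmaDoubleComplex *dA = NULL;
    magmaDoubleComplex **dA_array = NULL;
    magma_int_t *dinfo = NULL;
    magma_zmalloc(&dA, ldda * n * batchCount);
    magma_malloc((void**)&dA_array, batchCount * sizeof(*dA_array));
    magma_malloc((void**)&dinfo,    batchCount * sizeof(magma_int_t));

    // ... upload the batch of Hermitian positive definite matrices into dA ...

    // Matrix k starts at dA + k*ldda*n.
    zset_pointer(dA_array, dA, ldda, 0, 0, ldda*n, batchCount, queue);

    // Only the MagmaLower path is implemented by the routine above.
    magma_int_t info = magma_zpotrf_batched(MagmaLower, n, dA_array, ldda,
                                            dinfo, batchCount, queue);
    // info < 0 flags an argument error; dinfo[k] > 0 means matrix k is not
    // positive definite at the reported leading minor.

    magma_free(dA_array); magma_free(dinfo); magma_free(dA);
    magma_queue_destroy(queue);
}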
/**
    Purpose
    -------
    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
        A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    If the current stream is NULL, this version replaces it with a new
    stream to overlap computation with communication.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    dA      DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    @param[out]
    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as a failed memory
                  allocation.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is
                  used to solve a system of equations.

    @ingroup magma_dgesv_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgetrf_batched(
    magma_int_t m, magma_int_t n,
    double **dA_array, magma_int_t ldda,
    magma_int_t **ipiv_array, magma_int_t *info_array,
    magma_int_t batchCount, magma_queue_t queue)
{
#define A(i_, j_)  (A + (i_) + (j_)*ldda)

    magma_int_t min_mn = min(m, n);
    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));

    /* Check arguments */
    magma_int_t arginfo = 0;
    if (m < 0)
        arginfo = -1;
    else if (n < 0)
        arginfo = -2;
    else if (ldda < max(1,m))
        arginfo = -4;

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    /* Quick return if possible */
    if (min_mn == 0)
        return arginfo;

    if ( m > 2048 || n > 2048 ) {
        printf("=========================================================================================\n");
        printf(" WARNING: batched routines are designed for small sizes; it might be better to use the\n"
               " Native/Hybrid classical routines if you want performance\n");
        printf("=========================================================================================\n");
    }

//#define ENABLE_TIMER3
#if defined(ENABLE_TIMER3)
    real_Double_t tall=0.0, tloop=0., talloc=0., tdalloc=0.;
    tall   = magma_sync_wtime(0);
    talloc = magma_sync_wtime(0);
#endif

    double neg_one = MAGMA_D_NEG_ONE;
    double one     = MAGMA_D_ONE;
    magma_int_t ib, i, k, pm;
    magma_int_t nb = BATRF_NB;
    magma_int_t gemm_crossover = nb > 32 ? 127 : 160;
    // magma_int_t gemm_crossover = n;  // use only the streamed gemm

#if defined(USE_CUOPT)
    cublasHandle_t myhandle;
    cublasCreate_v2(&myhandle);
#else
    cublasHandle_t myhandle = NULL;
#endif

    magma_int_t **dipiv_displ = NULL;
    double **dA_displ    = NULL;
    double **dW0_displ   = NULL;
    double **dW1_displ   = NULL;
    double **dW2_displ   = NULL;
    double **dW3_displ   = NULL;
    double **dW4_displ   = NULL;
    double **dinvA_array = NULL;
    double **dwork_array = NULL;

    magma_malloc((void**)&dipiv_displ, batchCount * sizeof(*dipiv_displ));
    magma_malloc((void**)&dA_displ,    batchCount * sizeof(*dA_displ));
    magma_malloc((void**)&dW0_displ,   batchCount * sizeof(*dW0_displ));
    magma_malloc((void**)&dW1_displ,   batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,   batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,   batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ,   batchCount * sizeof(*dW4_displ));
    magma_malloc((void**)&dinvA_array, batchCount * sizeof(*dinvA_array));
    magma_malloc((void**)&dwork_array, batchCount * sizeof(*dwork_array));

    magma_int_t invA_msize  = ((n+TRI_NB-1)/TRI_NB)*TRI_NB*TRI_NB;
    magma_int_t dwork_msize = n*nb;
    magma_int_t **pivinfo_array = NULL;
    magma_int_t *pivinfo = NULL;
    double* dinvA = NULL;
    double* dwork = NULL;  // dinvA and dwork are workspace in dtrsm
    double **cpuAarray = NULL;
    magma_dmalloc( &dinvA, invA_msize * batchCount);
    magma_dmalloc( &dwork, dwork_msize * batchCount );
    magma_malloc((void**)&pivinfo_array, batchCount * sizeof(*pivinfo_array));
    magma_malloc((void**)&pivinfo,       batchCount * m * sizeof(magma_int_t));
    magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(double*));

    /* check allocation */
    if ( dA_displ == NULL || dW0_displ == NULL || dW1_displ == NULL ||
         dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL ||
         dinvA_array == NULL || dwork_array == NULL || dinvA == NULL ||
         dwork == NULL || cpuAarray == NULL ||
         dipiv_displ == NULL || pivinfo_array == NULL || pivinfo == NULL ) {
        magma_free(dA_displ);
        magma_free(dW0_displ);
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dinvA_array);
        magma_free(dwork_array);
        magma_free( dinvA );
        magma_free( dwork );
        magma_free_cpu(cpuAarray);
        magma_free(dipiv_displ);
        magma_free(pivinfo_array);
        magma_free(pivinfo);
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magmablas_dlaset_q(MagmaFull, invA_msize,  batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dinvA, invA_msize,  queue);
    magmablas_dlaset_q(MagmaFull, dwork_msize, batchCount, MAGMA_D_ZERO, MAGMA_D_ZERO, dwork, dwork_msize, queue);
    dset_pointer(dwork_array, dwork, 1,      0, 0, dwork_msize, batchCount, queue);
    dset_pointer(dinvA_array, dinvA, TRI_NB, 0, 0, invA_msize,  batchCount, queue);
    set_ipointer(pivinfo_array, pivinfo, 1, 0, 0, m, batchCount, queue);
    // printf(" I am in dgetrfbatched\n");

    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);
    magma_int_t streamid;
    const magma_int_t nbstreams = 32;
    magma_queue_t stream[nbstreams];
    for (i = 0; i < nbstreams; i++) {
        magma_queue_create( &stream[i] );
    }
    magma_getvector( batchCount, sizeof(double*), dA_array, 1, cpuAarray, 1);

#if defined(ENABLE_TIMER3)
    printf(" I am after malloc\n");
    talloc = magma_sync_wtime(0) - talloc;
    tloop  = magma_sync_wtime(0);
#endif

    for (i = 0; i < min_mn; i += nb) {
        magmablasSetKernelStream(NULL);
        ib = min(nb, min_mn-i);
        pm = m-i;
        magma_idisplace_pointers(dipiv_displ, ipiv_array, ldda, i, 0, batchCount, queue);
        magma_ddisplace_pointers(dA_displ,    dA_array,   ldda, i, i, batchCount, queue);

        //===============================================
        // panel factorization
        //===============================================
#if 0
        arginfo = magma_dgetf2_batched(
                      pm, ib,
                      dA_displ, ldda,
                      dW1_displ, dW2_displ, dW3_displ,
                      dipiv_displ,
                      info_array, i, batchCount, myhandle);
#else
        arginfo = magma_dgetrf_recpanel_batched(
                      pm, ib, 16,
                      dA_displ, ldda,
                      dipiv_displ, pivinfo_array,
                      dwork_array, nb,
                      dinvA_array, invA_msize,
                      dW0_displ, dW1_displ, dW2_displ,
                      dW3_displ, dW4_displ,
                      info_array, i, batchCount, myhandle, queue);
#endif
        if (arginfo != 0) goto fin;
        //===============================================
        // end of panel
        //===============================================

#define RUN_ALL
#ifdef RUN_ALL
        // setup pivinfo before adjusting ipiv
        setup_pivinfo_batched(pivinfo_array, dipiv_displ, pm, ib, batchCount, queue);
        adjust_ipiv_batched(dipiv_displ, ib, i, batchCount, queue);
        // stepinit_ipiv(pivinfo_array, pm, batchCount);  // for debugging swaps: creates an ipiv

#if 0
        dlaswp_batched( i, dA_displ, ldda,
                        i, i+ib,
                        dipiv_displ, pivinfo_array, batchCount);
#else
        magma_ddisplace_pointers(dA_displ,  dA_array, ldda, i, 0, batchCount, queue);
        magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, 0, batchCount, queue);
        magma_dlaswp_rowparallel_batched( i, dA_displ, ldda,
                                          dW0_displ, ldda,
                                          i, i+ib,
                                          pivinfo_array, batchCount, queue);
#endif

        if ( (i + ib) < n) {
            // swap the right side, then TRSM
            magma_ddisplace_pointers(dA_displ, dA_array, ldda, i, i+ib, batchCount, queue);
            dset_pointer(dwork_array, dwork, nb, 0, 0, dwork_msize, batchCount, queue);  // I don't think it is needed -- Azzam
            magma_dlaswp_rowparallel_batched( n-(i+ib), dA_displ, ldda,
                                              dwork_array, nb,
                                              i, i+ib,
                                              pivinfo_array, batchCount, queue);

            magma_ddisplace_pointers(dA_displ,  dA_array, ldda, i, i,    batchCount, queue);
            magma_ddisplace_pointers(dW0_displ, dA_array, ldda, i, i+ib, batchCount, queue);
            magmablas_dtrsm_outofplace_batched(MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, 1,
                                               ib, n-i-ib,
                                               MAGMA_D_ONE,
                                               dA_displ,    ldda,        // dA
                                               dwork_array, nb,          // dB
                                               dW0_displ,   ldda,        // dX
                                               dinvA_array, invA_msize,
                                               dW1_displ, dW2_displ,
                                               dW3_displ, dW4_displ,
                                               0, batchCount, queue);

            if ( (i + ib) < m) {
                // If the GEMM size is > 160, use a streamed classical cuBLAS GEMM,
                // since it is faster; the batched version wins only when
                // M = N <= 160 (measured on a K40c).
                //-------------------------------------------
                //          USE STREAM GEMM
                //-------------------------------------------
                if ( (m-i-ib) > gemm_crossover && (n-i-ib) > gemm_crossover) {
                    //printf("calling streamed dgemm %d %d %d \n", m-i-ib, n-i-ib, ib);

                    // Since this uses different streams, we would need to wait for
                    // the TRSM and the swap; but the rest of the code runs on the
                    // NULL stream, which synchronizes by itself, so no explicit
                    // sync is needed.
                    //magma_queue_sync(NULL);
                    for (k = 0; k < batchCount; k++) {
                        streamid = k % nbstreams;
                        magmablasSetKernelStream(stream[streamid]);
                        magma_dgemm(MagmaNoTrans, MagmaNoTrans,
                                    m-i-ib, n-i-ib, ib,
                                    neg_one, cpuAarray[k] + (i+ib)+i*ldda,      ldda,
                                             cpuAarray[k] + i+(i+ib)*ldda,      ldda,
                                    one,     cpuAarray[k] + (i+ib)+(i+ib)*ldda, ldda);
                    }
                    // We would synchronize here so dgetf2 does not start before the
                    // update of at least the next panel finishes, but the NULL
                    // stream used elsewhere already provides that synchronization.
                    //magma_device_sync();
                }
                //-------------------------------------------
                //          USE BATCHED GEMM
                //-------------------------------------------
                else {
                    magma_ddisplace_pointers(dA_displ,  dA_array, ldda, i+ib, i,    batchCount, queue);
                    magma_ddisplace_pointers(dW1_displ, dA_array, ldda, i,    i+ib, batchCount, queue);
                    magma_ddisplace_pointers(dW2_displ, dA_array, ldda, i+ib, i+ib, batchCount, queue);
                    //printf("calling batched dgemm %d %d %d \n", m-i-ib, n-i-ib, ib);
                    magmablas_dgemm_batched( MagmaNoTrans, MagmaNoTrans,
                                             m-i-ib, n-i-ib, ib,
                                             neg_one, dA_displ,  ldda,
                                                      dW1_displ, ldda,
                                             one,     dW2_displ, ldda,
                                             batchCount, queue);
                }  // end of batched/streamed gemm
            }  // end of if( (i + ib) < m)
        }  // end of if( (i + ib) < n)
#endif
    }  // end of for

fin:
    magma_queue_sync(NULL);
#if defined(ENABLE_TIMER3)
    tloop   = magma_sync_wtime(0) - tloop;
    tdalloc = magma_sync_wtime(0);
#endif

    for (i = 0; i < nbstreams; i++) {
        magma_queue_destroy( stream[i] );
    }
    magmablasSetKernelStream(cstream);
#if defined(USE_CUOPT)
    cublasDestroy_v2(myhandle);
#endif

    magma_free(dA_displ);
    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dinvA_array);
    magma_free(dwork_array);
    magma_free( dinvA );
    magma_free( dwork );
    magma_free_cpu(cpuAarray);
    magma_free(dipiv_displ);
    magma_free(pivinfo_array);
    magma_free(pivinfo);

#if defined(ENABLE_TIMER3)
    tdalloc = magma_sync_wtime(0) - tdalloc;
    tall    = magma_sync_wtime(0) - tall;
    printf("here is the timing from inside dgetrf_batched talloc: %10.5f tloop: %10.5f tdalloc: %10.5f tall: %10.5f sum: %10.5f\n",
           talloc, tloop, tdalloc, tall, talloc+tloop+tdalloc );
#endif
    return arginfo;
}
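// Illustrative driver for magma_dgetrf_batched -- again a sketch under the
// same assumptions (MAGMA 1.x-style helpers, contiguous batch storage).
// Pivot indices for matrix k land in the k-th block of the dipiv allocation.
void run_dgetrf_batched_example(magma_int_t m, magma_int_t n, magma_int_t batchCount)
{
    magma_int_t ldda = m;
    magma_int_t min_mn = min(m, n);
    magma_queue_t queue;
    magma_queue_create( &queue );

    double *dA = NULL;
    double **dA_array = NULL;
    magma_int_t *dipiv = NULL, **dipiv_array = NULL, *dinfo = NULL;
    magma_dmalloc(&dA, ldda * n * batchCount);
    magma_malloc((void**)&dA_array,    batchCount * sizeof(*dA_array));
    magma_malloc((void**)&dipiv,       batchCount * min_mn * sizeof(magma_int_t));
    magma_malloc((void**)&dipiv_array, batchCount * sizeof(*dipiv_array));
    magma_malloc((void**)&dinfo,       batchCount * sizeof(magma_int_t));

    // ... upload the batch of general m-by-n matrices into dA ...

    dset_pointer(dA_array, dA, ldda, 0, 0, ldda*n, batchCount, queue);
    set_ipointer(dipiv_array, dipiv, 1, 0, 0, min_mn, batchCount, queue);

    magma_int_t info = magma_dgetrf_batched(m, n, dA_array, ldda, dipiv_array,
                                            dinfo, batchCount, queue);
    // info < 0 flags an argument error; dinfo[k] > 0 reports a singular U for
    // matrix k, mirroring the INFO convention documented above.

    magma_free(dA_array); magma_free(dipiv_array); magma_free(dipiv);
    magma_free(dinfo); magma_free(dA);
    magma_queue_destroy(queue);
}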