/* //////////////////////////////////////////////////////////////////////////// -- Testing clacpy_batched Code is very similar to testing_cgeadd_batched.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B; magmaFloatComplex_ptr d_A, d_B; magmaFloatComplex **hAarray, **hBarray, **dAarray, **dBarray; magma_int_t M, N, mb, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_int_t status = 0; magma_opts opts( MagmaOptsBatched ); opts.parse_opts( argc, argv ); mb = (opts.nb == 0 ? 32 : opts.nb); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*mb; nstride = 3*nb; printf("%% mb=%d, nb=%d, mstride=%d, nstride=%d\n", (int) mb, (int) nb, (int) mstride, (int) nstride ); printf("%% M N ntile CPU Gflop/s (ms) GPU Gflop/s (ms) check\n"); printf("%%================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; lda = M; ldda = magma_roundup( M, opts.align ); // multiple of 32 by default size = lda*N; if ( N < nb || M < nb ) { ntile = 0; } else { ntile = min( (M - nb)/mstride + 1, (N - nb)/nstride + 1 ); } gbytes = 2.*mb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda *N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, ldda*N ); TESTING_MALLOC_CPU( hAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_CPU( hBarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dBarray, magmaFloatComplex*, ntile ); lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda, opts.queue ); magma_csetmatrix( M, N, h_B, lda, d_B, ldda, opts.queue ); // setup pointers for( magma_int_t tile = 0; tile < ntile; ++tile ) { magma_int_t offset = tile*mstride + tile*nstride*ldda; hAarray[tile] = &d_A[offset]; hBarray[tile] = &d_B[offset]; } magma_setvector( ntile, sizeof(magmaFloatComplex*), hAarray, 1, dAarray, 1, opts.queue ); magma_setvector( ntile, sizeof(magmaFloatComplex*), hBarray, 1, dBarray, 1, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ); magmablas_clacpy_batched( MagmaFull, mb, nb, dAarray, ldda, dBarray, ldda, ntile, opts.queue ); gpu_time = magma_sync_wtime( opts.queue ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( magma_int_t tile = 0; tile < ntile; ++tile ) { magma_int_t offset = tile*mstride + tile*nstride*lda; lapackf77_clacpy( MagmaFullStr, &mb, &nb, &h_A[offset], &lda, &h_B[offset], &lda ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_B, ldda, h_A, lda, opts.queue ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_B, &ione); error = lapackf77_clange("f", &M, &N, h_B, &lda, work); bool okay = (error == 0); status += ! okay; printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) M, (int) N, (int) ntile, cpu_perf, cpu_time*1000., gpu_perf, gpu_time*1000., (okay ? "ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_CPU( hAarray ); TESTING_FREE_CPU( hBarray ); TESTING_FREE_DEV( dAarray ); TESTING_FREE_DEV( dBarray ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } opts.cleanup(); TESTING_FINALIZE(); return status; }
extern "C" magma_int_t magma_cgeqrf_batched( magma_int_t m, magma_int_t n, magmaFloatComplex **dA_array, magma_int_t ldda, magmaFloatComplex **tau_array, magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue) { #define dA(i, j) (dA + (i) + (j)*ldda) // A(i, j) means at i row, j column magma_int_t min_mn = min(m, n); cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t)); /* Check arguments */ magma_int_t arginfo = 0; if (m < 0) arginfo = -1; else if (n < 0) arginfo = -2; else if (ldda < max(1,m)) arginfo = -4; if (arginfo != 0) { magma_xerbla( __func__, -(arginfo) ); return arginfo; } /* Quick return if possible */ if (m == 0 || n == 0) if(min_mn == 0 ) return arginfo; if( m > 2048 || n > 2048 ){ printf("=========================================================================================\n"); printf(" WARNING batched routines are designed for small sizes it might be better to use the\n Native/Hybrid classical routines if you want performance\n"); printf("=========================================================================================\n"); } magma_int_t nb = 32; magma_int_t nnb = 8; magma_int_t i, k, ib=nb, jb=nnb; magma_int_t ldw, ldt, ldr, offset; cublasHandle_t myhandle; cublasCreate_v2(&myhandle); magmaFloatComplex **dW0_displ = NULL; magmaFloatComplex **dW1_displ = NULL; magmaFloatComplex **dW2_displ = NULL; magmaFloatComplex **dW3_displ = NULL; magmaFloatComplex **dW4_displ = NULL; magmaFloatComplex **dW5_displ = NULL; magmaFloatComplex *dwork = NULL; magmaFloatComplex *dT = NULL; magmaFloatComplex *dR = NULL; magmaFloatComplex **dR_array = NULL; magmaFloatComplex **dT_array = NULL; magmaFloatComplex **cpuAarray = NULL; magmaFloatComplex **cpuTarray = NULL; magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ)); magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ)); // used in clarfb magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ)); magma_malloc((void**)&dR_array, batchCount * sizeof(*dR_array)); magma_malloc((void**)&dT_array, batchCount * sizeof(*dT_array)); ldt = ldr = min(nb, min_mn); magma_cmalloc(&dwork, (2 * nb * n) * batchCount); magma_cmalloc(&dR, ldr * n * batchCount); magma_cmalloc(&dT, ldt * ldt * batchCount); magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaFloatComplex*)); magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(magmaFloatComplex*)); /* check allocation */ if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL || dW4_displ == NULL || dW5_displ == NULL || dR_array == NULL || dT_array == NULL || dR == NULL || dT == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) { magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_free(dwork); free(cpuAarray); free(cpuTarray); magma_int_t info = MAGMA_ERR_DEVICE_ALLOC; magma_xerbla( __func__, -(info) ); return info; } magmablas_claset_q(MagmaFull, ldr, n*batchCount , MAGMA_C_ZERO, MAGMA_C_ZERO, dR, ldr, queue); magmablas_claset_q(MagmaFull, ldt, ldt*batchCount, MAGMA_C_ZERO, MAGMA_C_ZERO, dT, ldt, queue); cset_pointer(dR_array, dR, 1, 0, 0, ldr*min(nb, min_mn), batchCount, queue); cset_pointer(dT_array, dT, 1, 0, 0, ldt*min(nb, min_mn), batchCount, queue); magma_queue_t cstream; magmablasGetKernelStream(&cstream); magma_int_t streamid; const magma_int_t nbstreams=32; magma_queue_t stream[nbstreams]; for(i=0; i<nbstreams; i++){ magma_queue_create( &stream[i] ); } magma_getvector( batchCount, sizeof(magmaFloatComplex*), dA_array, 1, cpuAarray, 1); magma_getvector( batchCount, sizeof(magmaFloatComplex*), dT_array, 1, cpuTarray, 1); magmablasSetKernelStream(NULL); for(i=0; i<min_mn;i+=nb) { ib = min(nb, min_mn-i); //=============================================== // panel factorization //=============================================== magma_cdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); magma_cdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue); //dwork is used in panel factorization and trailing matrix update //dW4_displ, dW5_displ are used as workspace and configured inside magma_cgeqrf_panel_batched(m-i, ib, jb, dW0_displ, ldda, dW2_displ, dT_array, ldt, dR_array, ldr, dW1_displ, dW3_displ, dwork, dW4_displ, dW5_displ, info_array, batchCount, myhandle, queue); //=============================================== // end of panel //=============================================== //direct panel matrix V in dW0_displ, magma_cdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue); // copy the upper part of V into dR cgeqrf_copy_upper_batched(ib, jb, dW0_displ, ldda, dR_array, ldr, batchCount, queue); //=============================================== // update trailing matrix //=============================================== //dwork is used in panel factorization and trailing matrix update //reset dW4_displ ldw = nb; cset_pointer(dW4_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); offset = ldw*n*batchCount; cset_pointer(dW5_displ, dwork + offset, 1, 0, 0, ldw*n, batchCount, queue ); if( (n-ib-i) > 0) { // set the diagonal of v as one and the upper triangular part as zero magmablas_claset_batched(MagmaUpper, ib, ib, MAGMA_C_ZERO, MAGMA_C_ONE, dW0_displ, ldda, batchCount, queue); magma_cdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue); // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation magma_clarft_batched(m-i, ib, 0, dW0_displ, ldda, dW2_displ, dT_array, ldt, dW4_displ, nb*ldt, batchCount, myhandle, queue); // perform C = (I-V T^H V^H) * C, C is the trailing matrix //------------------------------------------- // USE STREAM GEMM //------------------------------------------- if( (m-i) > 100 && (n-i-ib) > 100) { // But since the code use the NULL stream everywhere, // so I don't need it, because the NULL stream do the sync by itself //magma_device_sync(); for(k=0; k<batchCount; k++) { streamid = k%nbstreams; magmablasSetKernelStream(stream[streamid]); // the stream gemm must take cpu pointer magma_clarfb_gpu_gemm(MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, cpuAarray[k] + i + i * ldda, ldda, cpuTarray[k], ldt, cpuAarray[k] + i + (i+ib) * ldda, ldda, dwork + nb * n * k, -1, dwork + nb * n * batchCount + nb * n * k, -1); } // need to synchronise to be sure that panel does not start before // finishing the update at least of the next panel // BUT no need for it as soon as the other portion of the code // use the NULL stream which do the sync by itself //magma_device_sync(); magmablasSetKernelStream(NULL); } //------------------------------------------- // USE BATCHED GEMM //------------------------------------------- else { //direct trailing matrix in dW1_displ magma_cdisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue); magma_clarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-i, n-i-ib, ib, (const magmaFloatComplex**)dW0_displ, ldda, (const magmaFloatComplex**)dT_array, ldt, dW1_displ, ldda, dW4_displ, ldw, dW5_displ, ldw, batchCount, myhandle, queue); } }// update the trailing matrix //=============================================== // copy dR back to V after the trailing matrix update magmablas_clacpy_batched(MagmaUpper, ib, ib, dR_array, ldr, dW0_displ, ldda, batchCount, queue); } for(k=0; k<nbstreams; k++){ magma_queue_destroy( stream[k] ); } magmablasSetKernelStream(cstream); cublasDestroy_v2(myhandle); magma_free(dW0_displ); magma_free(dW1_displ); magma_free(dW2_displ); magma_free(dW3_displ); magma_free(dW4_displ); magma_free(dW5_displ); magma_free(dR_array); magma_free(dT_array); magma_free(dR); magma_free(dT); magma_free(dwork); free(cpuAarray); free(cpuTarray); return arginfo; }
extern "C" magma_int_t magma_cgeqrf_panel_batched( magma_int_t m, magma_int_t n, magma_int_t nb, magmaFloatComplex** dA_array, magma_int_t ldda, magmaFloatComplex** tau_array, magmaFloatComplex** dT_array, magma_int_t ldt, magmaFloatComplex** dR_array, magma_int_t ldr, magmaFloatComplex** dW0_displ, magmaFloatComplex** dW1_displ, magmaFloatComplex *dwork, magmaFloatComplex** dW2_displ, magmaFloatComplex** dW3_displ, magma_int_t *info_array, magma_int_t batchCount, cublasHandle_t myhandle, magma_queue_t queue) { magma_int_t j, jb; magma_int_t ldw = nb; for( j=0; j<n; j+=nb) { jb = min(nb, n-j); magma_cdisplace_pointers(dW0_displ, dA_array, ldda, j, j, batchCount, queue); magma_cdisplace_pointers(dW2_displ, tau_array, 1, j, 0, batchCount, queue); magma_cdisplace_pointers(dW3_displ, dR_array, ldr, j, j, batchCount, queue); // //sub-panel factorization magma_cgeqr2_batched( m-j, jb, dW0_displ, ldda, dW2_displ, info_array, batchCount, queue); //copy upper part of dA to dR magma_cdisplace_pointers(dW0_displ, dA_array, ldda, j, j, batchCount, queue); magma_cdisplace_pointers(dW3_displ, dR_array, ldr, j, j, batchCount, queue); magmablas_clacpy_batched(MagmaUpper, jb, jb, dW0_displ, ldda, dW3_displ, ldr, batchCount, queue); magma_cdisplace_pointers(dW0_displ, dA_array, ldda, j, j, batchCount, queue); magma_cdisplace_pointers(dW3_displ, dR_array, ldr, j, j, batchCount, queue); magmablas_claset_batched(MagmaUpper, jb, jb, MAGMA_C_ZERO, MAGMA_C_ONE, dW0_displ, ldda, batchCount, queue); if( (n-j-jb) > 0) //update the trailing matrix inside the panel { magma_clarft_sm32x32_batched(m-j, jb, dW0_displ, ldda, dW2_displ, dT_array, ldt, batchCount, myhandle, queue); magma_cdisplace_pointers(dW1_displ, dA_array, ldda, j, j + jb, batchCount, queue); cset_pointer(dW2_displ, dwork, 1, 0, 0, ldw*n, batchCount, queue ); cset_pointer(dW3_displ, dwork + ldw*n*batchCount, 1, 0, 0, ldw*n, batchCount, queue ); magma_clarfb_gemm_batched( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise, m-j, n-j-jb, jb, (const magmaFloatComplex**)dW0_displ, ldda, (const magmaFloatComplex**)dT_array, ldt, dW1_displ, ldda, dW2_displ, ldw, dW3_displ, ldw, batchCount, myhandle, queue); } } return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing clacpy_batched Code is very similar to testing_cgeadd_batched.cpp */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gbytes, gpu_perf, gpu_time, cpu_perf, cpu_time; float error, work[1]; magmaFloatComplex c_neg_one = MAGMA_C_NEG_ONE; magmaFloatComplex *h_A, *h_B; magmaFloatComplex *d_A, *d_B; magmaFloatComplex **hAarray, **hBarray, **dAarray, **dBarray; magma_int_t M, N, mb, nb, size, lda, ldda, mstride, nstride, ntile; magma_int_t ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); mb = (opts.nb == 0 ? 32 : opts.nb); nb = (opts.nb == 0 ? 64 : opts.nb); mstride = 2*mb; nstride = 3*nb; printf("mb=%d, nb=%d, mstride=%d, nstride=%d\n", (int) mb, (int) nb, (int) mstride, (int) nstride ); printf(" M N ntile CPU GFlop/s (sec) GPU GFlop/s (sec) check\n"); printf("=================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; lda = M; ldda = ((M+31)/32)*32; size = lda*N; if ( N < nb || M < nb ) { ntile = 0; } else { ntile = min( (M - nb)/mstride + 1, (N - nb)/nstride + 1 ); } gbytes = 2.*mb*nb*ntile / 1e9; TESTING_MALLOC_CPU( h_A, magmaFloatComplex, lda *N ); TESTING_MALLOC_CPU( h_B, magmaFloatComplex, lda *N ); TESTING_MALLOC_DEV( d_A, magmaFloatComplex, ldda*N ); TESTING_MALLOC_DEV( d_B, magmaFloatComplex, ldda*N ); TESTING_MALLOC_CPU( hAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_CPU( hBarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dAarray, magmaFloatComplex*, ntile ); TESTING_MALLOC_DEV( dBarray, magmaFloatComplex*, ntile ); lapackf77_clarnv( &ione, ISEED, &size, h_A ); lapackf77_clarnv( &ione, ISEED, &size, h_B ); /* ==================================================================== Performs operation using MAGMA =================================================================== */ magma_csetmatrix( M, N, h_A, lda, d_A, ldda ); magma_csetmatrix( M, N, h_B, lda, d_B, ldda ); // setup pointers for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*ldda; hAarray[tile] = &d_A[offset]; hBarray[tile] = &d_B[offset]; } magma_setvector( ntile, sizeof(magmaFloatComplex*), hAarray, 1, dAarray, 1 ); magma_setvector( ntile, sizeof(magmaFloatComplex*), hBarray, 1, dBarray, 1 ); gpu_time = magma_sync_wtime( 0 ); magmablas_clacpy_batched( MagmaUpperLower, mb, nb, dAarray, ldda, dBarray, ldda, ntile ); gpu_time = magma_sync_wtime( 0 ) - gpu_time; gpu_perf = gbytes / gpu_time; /* ===================================================================== Performs operation using LAPACK =================================================================== */ cpu_time = magma_wtime(); for( int tile = 0; tile < ntile; ++tile ) { int offset = tile*mstride + tile*nstride*lda; lapackf77_clacpy( MagmaUpperLowerStr, &mb, &nb, &h_A[offset], &lda, &h_B[offset], &lda ); } cpu_time = magma_wtime() - cpu_time; cpu_perf = gbytes / cpu_time; /* ===================================================================== Check the result =================================================================== */ magma_cgetmatrix( M, N, d_B, ldda, h_A, lda ); blasf77_caxpy(&size, &c_neg_one, h_A, &ione, h_B, &ione); error = lapackf77_clange("f", &M, &N, h_B, &lda, work); printf("%5d %5d %5d %7.2f (%7.2f) %7.2f (%7.2f) %s\n", (int) M, (int) N, (int) ntile, cpu_perf, cpu_time, gpu_perf, gpu_time, (error == 0. ? "ok" : "failed") ); TESTING_FREE_CPU( h_A ); TESTING_FREE_CPU( h_B ); TESTING_FREE_DEV( d_A ); TESTING_FREE_DEV( d_B ); TESTING_FREE_CPU( hAarray ); TESTING_FREE_CPU( hBarray ); TESTING_FREE_DEV( dAarray ); TESTING_FREE_DEV( dBarray ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return 0; }