void magma_dswap(
    magma_int_t n,
    double *dx, magma_int_t incx,
    double *dy, magma_int_t incy )
{
    cublasDswap( n, dx, incx, dy, incy );
}
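/*
 * Hedged sketch (not MAGMA code): the wrapper above uses the legacy cuBLAS
 * API, which has no handle and no return status. An equivalent written
 * against the v2 API would thread a cublasHandle_t through and return the
 * status; the function name and the handle management here are assumptions
 * made for illustration only.
 */
#include <cublas_v2.h>

static cublasStatus_t
magma_dswap_v2_sketch( cublasHandle_t handle, int n,
                       double *dx, int incx,
                       double *dy, int incy )
{
    /* Swap n device-resident doubles between dx (stride incx) and dy (stride incy). */
    return cublasDswap( handle, n, dx, incx, dy, incy );
}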
void d_swap(SEXP rx, SEXP rincx, SEXP ry, SEXP rincy)
{
    int nx, ny, n,
        incx = asInteger(rincx),
        incy = asInteger(rincy);
    double *x, *y;

    unpackVector(rx, &nx, &x);
    unpackVector(ry, &ny, &y);
    n = imin2(nx, ny);

    cublasDswap(n, x, incx, y, incy);
    checkCublasError("d_swap");
}
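/*
 * Hedged sketch: checkCublasError() is a helper of the surrounding R package
 * and is not shown here. With the legacy cuBLAS API it would have to poll the
 * per-thread error flag via cublasGetError(); the helper name and message
 * format below are assumptions, only the cuBLAS and R error() calls are real.
 */
#include <cublas.h>
#include <R_ext/Error.h>

static void checkCublasError_sketch(const char *msg)
{
    cublasStatus_t status = cublasGetError();   /* legacy API: fetch and clear the error flag */
    if (status != CUBLAS_STATUS_SUCCESS)
        error("cublas error %d in %s", (int) status, msg);
}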
void swap(Vector<double> &x, Vector<double> &y)
{
    assert(x.getSize() == y.getSize());
    cublasDswap(x.getSize(), x, x.inc(), y, y.inc());
}
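/*
 * Hedged sketch: a single-precision overload in the same style, assuming the
 * surrounding library's Vector<float> exposes the same getSize()/inc()
 * interface and the same implicit conversion to a device float*. cublasSswap
 * is the float variant of the same legacy BLAS-1 swap.
 */
void swap(Vector<float> &x, Vector<float> &y)
{
    assert(x.getSize() == y.getSize());
    cublasSswap(x.getSize(), x, x.inc(), y, y.inc());
}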
int CORE_dtstrf_cublas(int M, int N, int IB, int NB,
                       double *U, int LDU,
                       double *A, int LDA,
                       double *L, int LDL,
                       int *IPIV,
                       double *WORK, int LDWORK,
                       int *INFO)
{
    static double zzero = 0.0;
    static double mzone = -1.0;

    cublasStatus_t status;
    cudaError_t err;
    double alpha;
    int i, j, ii, sb;
    int im, ip;

#if CONFIG_VERBOSE
    fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n",
            __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK);
    fflush(stdout);
#endif

    /* Check input arguments */
    *INFO = 0;
    if (M < 0) {
        coreblas_error(1, "Illegal value of M");
        return -1;
    }
    if (N < 0) {
        coreblas_error(2, "Illegal value of N");
        return -2;
    }
    if (IB < 0) {
        coreblas_error(3, "Illegal value of IB");
        return -3;
    }
    if ((LDU < max(1,NB)) && (NB > 0)) {
        coreblas_error(6, "Illegal value of LDU");
        return -6;
    }
    if ((LDA < max(1,M)) && (M > 0)) {
        coreblas_error(8, "Illegal value of LDA");
        return -8;
    }
    if ((LDL < max(1,IB)) && (IB > 0)) {
        coreblas_error(10, "Illegal value of LDL");
        return -10;
    }

    /* Quick return */
    if ((M == 0) || (N == 0) || (IB == 0))
        return PLASMA_SUCCESS;

    /* Set L to 0 */
    err = cudaMemset(L, 0, LDL*N*sizeof(double));
    PLASMA_CUDA_ASSERT(err);

    double* dev_ptr = 0;
    err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double));
    PLASMA_CUDA_ASSERT(err);

    double* host_ptr;
    err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double));
    PLASMA_CUDA_ASSERT(err);

    int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV);

    ip = 0;
    for (ii = 0; ii < N; ii += IB) {
        sb = min(N-ii, IB);

        for (i = 0; i < sb; i++) {
            status = cublasIdamax(kaapi_cuda_cublas_handle(),
                                  M, &A[LDA*(ii+i)], 1, &im);
            PLASMA_CUBLAS_ASSERT(status);

            /* get im */
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            /* adjust index, CUBLAS uses 1-based indexing */
            im--;
            piv[ip] = ii+i+1;

            core_dtstrf_cmp(kaapi_cuda_kernel_stream(),
                            &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i],
                            dev_ptr, host_ptr);
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            if (host_ptr[0] == 1.0f) {
                /*
                 * Swap behind.
                 */
                status = cublasDswap(kaapi_cuda_cublas_handle(),
                                     i,
                                     &L[LDL*ii+i], LDL,
                                     &WORK[im], LDWORK);
                PLASMA_CUBLAS_ASSERT(status);

                /*
                 * Swap ahead.
                 */
                status = cublasDswap(kaapi_cuda_cublas_handle(),
                                     sb-i,
                                     &U[LDU*(ii+i)+ii+i], LDU,
                                     &A[LDA*(ii+i)+im], LDA);
                PLASMA_CUBLAS_ASSERT(status);

                /*
                 * Set IPIV.
                 */
                piv[ip] = NB + im + 1;

                core_dtstrf_set_zero(kaapi_cuda_kernel_stream(),
                                     A, LDA, i, ii, im, zzero);
            }

            core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(),
                                                &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i],
                                                zzero, dev_ptr, host_ptr);
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            if ((*INFO == 0) && (host_ptr[0] == 1.0f)) {
                *INFO = ii+i+1;
            }

            // alpha = ((double)1. / U[LDU*(ii+i)+ii+i]);
            alpha = host_ptr[1];

            status = cublasDscal(kaapi_cuda_cublas_handle(),
                                 M, &alpha, &A[LDA*(ii+i)], 1);
            PLASMA_CUBLAS_ASSERT(status);

            status = cublasDcopy(kaapi_cuda_cublas_handle(),
                                 M,
                                 &A[LDA*(ii+i)], 1,
                                 &WORK[LDWORK*i], 1);
            PLASMA_CUBLAS_ASSERT(status);

            status = cublasDger(kaapi_cuda_cublas_handle(),
                                M, sb-i-1,
                                &mzone,
                                &A[LDA*(ii+i)], 1,
                                &U[LDU*(ii+i+1)+ii+i], LDU,
                                &A[LDA*(ii+i+1)], LDA);
            PLASMA_CUBLAS_ASSERT(status);

            ip = ip+1;
        }

        /*
         * Apply the subpanel to the rest of the panel.
         */
        if (ii+i < N) {
            for (j = ii; j < ii+sb; j++) {
                if (piv[j] <= NB) {
                    piv[j] = piv[j] - ii;
                }
            }

            CORE_dssssm_cublas_v2(NB, N-(ii+sb), M, N-(ii+sb), sb, sb,
                                  &U[LDU*(ii+sb)+ii], LDU,
                                  &A[LDA*(ii+sb)],    LDA,
                                  &L[LDL*ii],         LDL,
                                  WORK, LDWORK,
                                  &piv[ii]);
            err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
            PLASMA_CUDA_ASSERT(err);

            for (j = ii; j < ii+sb; j++) {
                if (piv[j] <= NB) {
                    piv[j] = piv[j] + ii;
                }
            }
        }
    }

    cudaFreeHost(host_ptr);
    cudaFree(dev_ptr);

    return PLASMA_SUCCESS;
}
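/*
 * Hedged illustration (not part of PLASMA): the "swap ahead" call above
 * exchanges the trailing part of row ii+i of the U tile with row im of the A
 * tile. In isolation, swapping one length-k row between two column-major
 * tiles turns the leading dimensions into the cublasDswap strides; the
 * function and argument names here are invented for the example.
 */
#include <cublas_v2.h>

static cublasStatus_t
swap_tile_rows( cublasHandle_t handle, int k,
                double *U_row, int ldu,    /* e.g. &U[LDU*(ii+i)+ii+i] */
                double *A_row, int lda )   /* e.g. &A[LDA*(ii+i)+im]   */
{
    /* Consecutive row elements of a column-major tile are one leading
     * dimension apart, hence strides ldu and lda. */
    return cublasDswap( handle, k, U_row, ldu, A_row, lda );
}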
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dswap, dswapblk, dpermute, dlaswp, dlaswpx
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    double *h_A1, *h_A2;
    double *d_A1, *d_A2;
    double *h_R1, *h_R2;

    // row-major and column-major performance
    real_Double_t row_perf0, col_perf0;
    real_Double_t row_perf1, col_perf1;
    real_Double_t row_perf2, col_perf2;
    real_Double_t row_perf3;
    real_Double_t row_perf4;
    real_Double_t row_perf5, col_perf5;
    real_Double_t row_perf6, col_perf6;
    real_Double_t row_perf7;
    real_Double_t cpu_perf;

    real_Double_t time, gbytes;

    magma_int_t N, lda, ldda, nb, j;
    magma_int_t ione = 1;
    magma_int_t *ipiv, *ipiv2;
    magma_int_t *d_ipiv;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_queue_t queue = 0;

    printf(" cublasDswap dswap dswapblk dlaswp dpermute dlaswp2 dlaswpx dcopymatrix CPU (all in )\n");
    printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk dlaswp (GByte/s)\n");
    printf("==================================================================================================================================================\n");

    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // For an N x N matrix, swap nb rows or nb columns using various methods.
            // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure.
            // The variable 'shift' keeps track of which bit is for the current test.
            int shift = 1;
            int check = 0;
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N+31)/32)*32;
            nb     = (opts.nb > 0 ? opts.nb : magma_get_dgetrf_nb( N ));
            nb     = min( N, nb );
            // each swap does 2N loads and 2N stores, for nb swaps
            gbytes = sizeof(double) * 4.*N*nb / 1e9;

            TESTING_MALLOC_PIN( h_A1, double, lda*N );
            TESTING_MALLOC_PIN( h_A2, double, lda*N );
            TESTING_MALLOC_PIN( h_R1, double, lda*N );
            TESTING_MALLOC_PIN( h_R2, double, lda*N );

            TESTING_MALLOC_CPU( ipiv,  magma_int_t, nb );
            TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb );

            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb );
            TESTING_MALLOC_DEV( d_A1, double, ldda*N );
            TESTING_MALLOC_DEV( d_A2, double, ldda*N );

            for( j=0; j < nb; j++ ) {
                ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1;
            }

            /* =====================================================================
             * cublasDswap, row-by-row (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf0 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf0 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswap, row-by-row (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf1 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf1 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswapblk, blocked version (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            row_perf2 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            col_perf2 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dpermute_long (1 matrix)
             */

            /* Row Major */
            memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) );  // dpermute updates ipiv2
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 );
            time = magma_sync_wtime( queue ) - time;
            row_perf3 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dlaswp( N, d_A1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf4 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix) - d_ipiv on GPU
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 );
            magmablas_dlaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 );
            time = magma_sync_wtime( queue ) - time;
            row_perf7 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswpx (extended for row- and col-major) (1 matrix)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf5 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* Col Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            col_perf5 = gbytes / time;

            time = magma_wtime();
            lapackf77_dlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione);
            time = magma_wtime() - time;
            cpu_perf = gbytes / time;

            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * Copy matrix.
             */
            time = magma_sync_wtime( queue );
            magma_dcopymatrix( N, nb, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            col_perf6 = 0.5 * gbytes / time;

            time = magma_sync_wtime( queue );
            magma_dcopymatrix( nb, N, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            row_perf6 = 0.5 * gbytes / time;

            printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n",
                   (int) N, (int) nb,
                   row_perf0, ((check & 0x001) != 0 ? '*' : ' '),
                   col_perf0, ((check & 0x002) != 0 ? '*' : ' '),
                   row_perf1, ((check & 0x004) != 0 ? '*' : ' '),
                   col_perf1, ((check & 0x008) != 0 ? '*' : ' '),
                   row_perf2, ((check & 0x010) != 0 ? '*' : ' '),
                   col_perf2, ((check & 0x020) != 0 ? '*' : ' '),
                   row_perf3, ((check & 0x040) != 0 ? '*' : ' '),
                   row_perf4, ((check & 0x080) != 0 ? '*' : ' '),
                   row_perf7, ((check & 0x100) != 0 ? '*' : ' '),
                   row_perf5, ((check & 0x200) != 0 ? '*' : ' '),
                   col_perf5, ((check & 0x400) != 0 ? '*' : ' '),
                   row_perf6,
                   col_perf6,
                   cpu_perf,
                   (check == 0 ? "ok" : "* failed") );
            status += ! (check == 0);

            TESTING_FREE_PIN( h_A1 );
            TESTING_FREE_PIN( h_A2 );
            TESTING_FREE_PIN( h_R1 );
            TESTING_FREE_PIN( h_R2 );

            TESTING_FREE_CPU( ipiv  );
            TESTING_FREE_CPU( ipiv2 );

            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_A1 );
            TESTING_FREE_DEV( d_A2 );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
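/*
 * Hedged sketch of the bandwidth model used by the tester above: each of the
 * nb row swaps moves 2N loads plus 2N stores of 8-byte doubles, so the tester
 * divides 4*N*nb*sizeof(double) bytes by the measured time. The helper below
 * restates that arithmetic; the function name and the use of CUDA events
 * (recorded by the caller around the timed swaps) instead of
 * magma_sync_wtime() are assumptions for illustration.
 */
#include <cuda_runtime.h>

static double swap_bandwidth_GBps( int N, int nb, cudaEvent_t start, cudaEvent_t stop )
{
    float ms = 0.f;
    cudaEventSynchronize( stop );              /* make sure the timed swaps finished */
    cudaEventElapsedTime( &ms, start, stop );  /* elapsed time in milliseconds */

    double bytes = 4.0 * N * nb * sizeof(double);  /* 2N loads + 2N stores per swap */
    return (bytes / 1e9) / (ms / 1e3);             /* GByte/s */
}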
//
// Overloaded function for dispatching to
//  * CUBLAS backend, and
//  * double value-type.
//
inline void swap( const int n, double* x, const int incx,
                  double* y, const int incy )
{
    cublasDswap( n, x, incx, y, incy );
}
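//
// Hedged sketch: the same dispatch pattern extended to complex double. The
// reinterpret_cast between std::complex<double> and cuDoubleComplex relies on
// their identical layout (two contiguous doubles); the real bindings library
// may wrap this differently, so treat it as an illustration only.
//
#include <complex>
#include <cublas.h>

inline void swap( const int n, std::complex<double>* x, const int incx,
                  std::complex<double>* y, const int incy )
{
    cublasZswap( n, reinterpret_cast<cuDoubleComplex*>(x), incx,
                    reinterpret_cast<cuDoubleComplex*>(y), incy );
}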
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // Exactly six input arguments are expected (see the parameter list below)
    if (nrhs != 6) mexErrMsgTxt("Wrong number of arguments");

    if (init == 0) {
        // Initialize function
        // mexLock();
        // load GPUmat
        gm = gmGetGPUmat();
        init = 1;
    }

    /* mex parameters are:
       0 Source array [X or one of the y-edge-zeroed copies of it]
       1 Destination array [Accumulator, basically]
       2 Stack array 1 [We swap XY or YZ planes with this before copying to assure a clean shift]
         - Must be all zeroes and of size max(NxNy, NyNz, NxNz)
       3 Stack array 2
       4 Shift directions
       5 Coefficient on shift */

    // Get GPU array pointers
    GPUtype srcArray    = gm->gputype.getGPUtype(prhs[0]);
    GPUtype dstArray    = gm->gputype.getGPUtype(prhs[1]);
    GPUtype stackArrayX = gm->gputype.getGPUtype(prhs[2]);
    //GPUtype stackArrayY = gm->gputype.getGPUtype(prhs[3]);
    GPUtype stackArrayZ = gm->gputype.getGPUtype(prhs[3]);

    // Get some control variables sorted out
    double *shiftdirs = mxGetPr(prhs[4]);
    const int *dims   = gm->gputype.getSize(srcArray);
    double alpha      = *mxGetPr(prhs[5]);

    int shifts[3];
    shifts[0] = (int)shiftdirs[0];
    shifts[1] = (int)shiftdirs[1];
    shifts[2] = (int)shiftdirs[2];

    double *cubSrc = (double*)gm->gputype.getGPUptr(srcArray);

    // Remove the appropriate YZ plane, if any
    double *cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);
    if (shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
    if (shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);

    // Remove the appropriate XZ plane, if any
    //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

    // Remove the appropriate XY plane, if any
    cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);
    if (shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
    if (shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

    // Decide the amount of offset to achieve the desired shift
    int theta = shifts[0] + dims[0]*shifts[1] + dims[0]*dims[1]*shifts[2];
    int Ntot  = dims[0] * dims[1] * dims[2];

    cubDst = (double*)gm->gputype.getGPUptr(dstArray);
    if (theta >= 0) {
        cublasDaxpy(Ntot-theta, alpha, cubSrc,         1, cubDst + theta, 1);
    } else {
        cublasDaxpy(Ntot+theta, alpha, cubSrc - theta, 1, cubDst,         1);
    }

    // Replace the XY plane if it was removed
    cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);
    if (shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
    if (shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

    // Replace the XZ plane if it was removed
    //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

    // Replace the YZ plane if it was removed
    cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);
    if (shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
    if (shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);
}
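/*
 * Hedged illustration of the plane-parking trick used above: because dswap is
 * an involution, swapping a boundary plane with an all-zero scratch buffer
 * zeroes that plane in the source array, and repeating the identical call
 * afterwards restores the plane and re-zeroes the scratch buffer for reuse.
 * The helper name and parameters below are invented; only the legacy
 * cublasDswap call is real, and both pointers are assumed to be device memory.
 */
static void toggle_yz_plane(double *src, double *zeroScratch,
                            int nx, int ny, int nz, int face /* 0 or nx-1 */)
{
    /* The x == face element of each of the ny*nz column-major pencils sits
     * nx elements from the next one, hence stride nx on the source side. */
    cublasDswap(ny * nz, src + face, nx, zeroScratch, 1);
}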