Esempio n. 1
0
void magma_dswap(
    magma_int_t n,
    double *dx, magma_int_t incx,
    double *dy, magma_int_t incy )
{
    cublasDswap( n, dx, incx, dy, incy );
}
Esempio n. 2
0
File: ardblas.c Progetto: rforge/gcb
void d_swap(SEXP rx, SEXP rincx, SEXP ry, SEXP rincy)
{
	int
		nx, ny, n,
		incx = asInteger(rincx),
		incy = asInteger(rincy);
	double
		* x, * y;

	unpackVector(rx, &nx, &x);
	unpackVector(ry, &ny, &y);
	n = imin2(nx, ny);

	cublasDswap(n, x, incx, y, incy);
	checkCublasError("d_swap");
}
void swap(Vector<double> &x, Vector<double> &y)
{
  assert(x.getSize() == y.getSize()); 
  cublasDswap(x.getSize(), x, x.inc(), y, y.inc());
}
Esempio n. 4
0
int CORE_dtstrf_cublas(int M, int N, int IB, int NB,
                double *U, int LDU,
                double *A, int LDA,
                double *L, int LDL,
                int *IPIV,
                double *WORK, int LDWORK,
                int *INFO)
{
  static double zzero = 0.0;
  static double mzone =-1.0;
  cublasStatus_t status;
  cudaError_t err;
  
  double alpha;
  int i, j, ii, sb;
  int im, ip;
  
#if CONFIG_VERBOSE
  fprintf(stdout, "%s: M=%d N=%d IB=%d NB=%d U=%p LDU=%d A=%p LDA=%d L=%p LDL=%d IPIV=%p WORK=%p LDWORK=%d\n",
          __FUNCTION__, M, N, IB, NB, U, LDU, A, LDA, L, LDL, IPIV, WORK, LDWORK);
  fflush(stdout);
#endif
  
  /* Check input arguments */
  *INFO = 0;
  if (M < 0) {
    coreblas_error(1, "Illegal value of M");
    return -1;
  }
  if (N < 0) {
    coreblas_error(2, "Illegal value of N");
    return -2;
  }
  if (IB < 0) {
    coreblas_error(3, "Illegal value of IB");
    return -3;
  }
  if ((LDU < max(1,NB)) && (NB > 0)) {
    coreblas_error(6, "Illegal value of LDU");
    return -6;
  }
  if ((LDA < max(1,M)) && (M > 0)) {
    coreblas_error(8, "Illegal value of LDA");
    return -8;
  }
  if ((LDL < max(1,IB)) && (IB > 0)) {
    coreblas_error(10, "Illegal value of LDL");
    return -10;
  }
  
  /* Quick return */
  if ((M == 0) || (N == 0) || (IB == 0))
    return PLASMA_SUCCESS;
  
  /* Set L to 0 */
  err = cudaMemset(L, 0, LDL*N*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  double* dev_ptr = 0;
  err = cudaMalloc((void**)&dev_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  double* host_ptr;
  err = cudaMallocHost((void**)&host_ptr, 2*sizeof(double));
  PLASMA_CUDA_ASSERT(err);
  
  int* piv = kaapi_memory_get_host_pointer_and_validate(IPIV);
  
  ip = 0;
  for (ii = 0; ii < N; ii += IB) {
    sb = min(N-ii, IB);
    
    for (i = 0; i < sb; i++) {
      status = cublasIdamax(kaapi_cuda_cublas_handle(),
                            M, &A[LDA*(ii+i)], 1, &im
                            );
      PLASMA_CUBLAS_ASSERT(status);
      
      /* get im */
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      /* ajust index, CUBLAS is 1-based indexing */
      im--;

      piv[ip] = ii+i+1;
      
      core_dtstrf_cmp(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if (host_ptr[0] == 1.0f) {
        /*
         * Swap behind.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
                   i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK
        );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Swap ahead.
         */
        status = cublasDswap(kaapi_cuda_cublas_handle(),
              sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA
         );
        PLASMA_CUBLAS_ASSERT(status);
        /*
         * Set IPIV.
         */
        piv[ip] = NB + im + 1;

        core_dtstrf_set_zero(kaapi_cuda_kernel_stream(),
                             A, LDA, i, ii, im, zzero
                        );
      }
      
      core_dtstrf_cmp_zzero_and_get_alpha(kaapi_cuda_kernel_stream(),
                      &A[LDA*(ii+i)+im], &U[LDU*(ii+i)+ii+i], zzero, dev_ptr, host_ptr);
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      if ((*INFO == 0) && (host_ptr[0] == 1.0f)) {
        *INFO = ii+i+1;
      }
      
//      alpha = ((double)1. / U[LDU*(ii+i)+ii+i]);
      alpha = host_ptr[1];
      status = cublasDscal(kaapi_cuda_cublas_handle(),
                           M, &alpha, &A[LDA*(ii+i)], 1
                           );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDcopy(kaapi_cuda_cublas_handle(),
                  M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1
        );
      PLASMA_CUBLAS_ASSERT(status);
      
      status = cublasDger(kaapi_cuda_cublas_handle(),
                          M, sb-i-1,
                          &mzone, &A[LDA*(ii+i)], 1,
                          &U[LDU*(ii+i+1)+ii+i], LDU,
                          &A[LDA*(ii+i+1)], LDA
      );
      PLASMA_CUBLAS_ASSERT(status);
      ip = ip+1;
    }
    /*
     * Apply the subpanel to the rest of the panel.
     */
    if(ii+i < N) {
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] - ii;
        }
      }
      
      CORE_dssssm_cublas_v2(
                  NB, N-(ii+sb), M, N-(ii+sb), sb, sb,
                  &U[LDU*(ii+sb)+ii], LDU,
                  &A[LDA*(ii+sb)], LDA,
                  &L[LDL*ii], LDL,
                  WORK, LDWORK, &piv[ii]
                  );
      err = cudaStreamSynchronize(kaapi_cuda_kernel_stream());
      PLASMA_CUDA_ASSERT(err);
      
      for(j = ii; j < ii+sb; j++) {
        if (piv[j] <= NB) {
          piv[j] = piv[j] + ii;
        }
      }
    }
  }
  
  cudaFreeHost(host_ptr);
  cudaFree(dev_ptr);
  return PLASMA_SUCCESS;
}
Esempio n. 5
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dswap, dswapblk, dpermute, dlaswp, dlaswpx
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    double *h_A1, *h_A2;
    double *d_A1, *d_A2;
    double *h_R1, *h_R2;
    
    // row-major and column-major performance
    real_Double_t row_perf0, col_perf0;
    real_Double_t row_perf1, col_perf1;
    real_Double_t row_perf2, col_perf2;
    real_Double_t row_perf3;
    real_Double_t row_perf4;
    real_Double_t row_perf5, col_perf5;
    real_Double_t row_perf6, col_perf6;
    real_Double_t row_perf7;
    real_Double_t cpu_perf;

    real_Double_t time, gbytes;

    magma_int_t N, lda, ldda, nb, j;
    magma_int_t ione = 1;
    magma_int_t *ipiv, *ipiv2;
    magma_int_t *d_ipiv;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_queue_t queue = 0;
    
    printf("            cublasDswap       dswap             dswapblk          dlaswp   dpermute dlaswp2  dlaswpx           dcopymatrix      CPU      (all in )\n");
    printf("    N   nb  row-maj/col-maj   row-maj/col-maj   row-maj/col-maj   row-maj  row-maj  row-maj  row-maj/col-maj   row-blk/col-blk  dlaswp   (GByte/s)\n");
    printf("==================================================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // For an N x N matrix, swap nb rows or nb columns using various methods.
            // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure.
            // The variable 'shift' keeps track of which bit is for current test
            int shift = 1;
            int check = 0;
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N+31)/32)*32;
            nb     = (opts.nb > 0 ? opts.nb : magma_get_dgetrf_nb( N ));
            nb     = min( N, nb );
            // each swap does 2N loads and 2N stores, for nb swaps
            gbytes = sizeof(double) * 4.*N*nb / 1e9;
                        
            TESTING_MALLOC_PIN( h_A1, double, lda*N );
            TESTING_MALLOC_PIN( h_A2, double, lda*N );
            TESTING_MALLOC_PIN( h_R1, double, lda*N );
            TESTING_MALLOC_PIN( h_R2, double, lda*N );
            
            TESTING_MALLOC_CPU( ipiv,  magma_int_t, nb );
            TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb );
            
            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb );
            TESTING_MALLOC_DEV( d_A1, double, ldda*N );
            TESTING_MALLOC_DEV( d_A2, double, ldda*N );
            
            for( j=0; j < nb; j++ ) {
                ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1;
            }
            
            /* =====================================================================
             * cublasDswap, row-by-row (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf0 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf0 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswap, row-by-row (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswapblk, blocked version (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            row_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            col_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dpermute_long (1 matrix)
             */
            
            /* Row Major */
            memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) );  // dpermute updates ipiv2
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 );
            time = magma_sync_wtime( queue ) - time;
            row_perf3 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dlaswp( N, d_A1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf4 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix) - d_ipiv on GPU
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 );
            magmablas_dlaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 );
            time = magma_sync_wtime( queue ) - time;
            row_perf7 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswpx (extended for row- and col-major) (1 matrix)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf5 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* Col Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            col_perf5 = gbytes / time;
            
            time = magma_wtime();
            lapackf77_dlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione);
            time = magma_wtime() - time;
            cpu_perf = gbytes / time;
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * Copy matrix.
             */
            
            time = magma_sync_wtime( queue );
            magma_dcopymatrix( N, nb, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            col_perf6 = 0.5 * gbytes / time;
            
            time = magma_sync_wtime( queue );
            magma_dcopymatrix( nb, N, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            row_perf6 = 0.5 * gbytes / time;

            printf("%5d  %3d  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f / %6.2f  %6.2f  %10s\n",
                   (int) N, (int) nb,
                   row_perf0, ((check & 0x001) != 0 ? '*' : ' '),
                   col_perf0, ((check & 0x002) != 0 ? '*' : ' '),
                   row_perf1, ((check & 0x004) != 0 ? '*' : ' '),
                   col_perf1, ((check & 0x008) != 0 ? '*' : ' '),
                   row_perf2, ((check & 0x010) != 0 ? '*' : ' '),
                   col_perf2, ((check & 0x020) != 0 ? '*' : ' '),
                   row_perf3, ((check & 0x040) != 0 ? '*' : ' '),
                   row_perf4, ((check & 0x080) != 0 ? '*' : ' '),
                   row_perf7, ((check & 0x100) != 0 ? '*' : ' '),
                   row_perf5, ((check & 0x200) != 0 ? '*' : ' '),
                   col_perf5, ((check & 0x400) != 0 ? '*' : ' '),
                   row_perf6,
                   col_perf6,
                   cpu_perf,
                   (check == 0 ? "ok" : "* failed") );
            status += ! (check == 0);
            
            TESTING_FREE_PIN( h_A1 );
            TESTING_FREE_PIN( h_A2 );
            TESTING_FREE_PIN( h_R1 );
            TESTING_FREE_PIN( h_R2 );
            
            TESTING_FREE_CPU( ipiv  );
            TESTING_FREE_CPU( ipiv2 );
            
            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_A1 );
            TESTING_FREE_DEV( d_A2 );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
Esempio n. 6
0
//
// Overloaded function for dispatching to
// * CUBLAS backend, and
// * double value-type.
//
inline void swap( const int n, double* x, const int incx, double* y,
        const int incy ) {
    cublasDswap( n, x, incx, y, incy );
}
Esempio n. 7
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
  // At least 2 arguments expected
  // Input and result
  if (nrhs!=6)
     mexErrMsgTxt("Wrong number of arguments");
  if (init == 0) {
    // Initialize function
    // mexLock();
    // load GPUmat
    gm = gmGetGPUmat();
    init = 1;
  }
  /* mex parameters are:
   0 Source array [X or one of the y-edge-zeroed copies of it]
   1 Destination array [Accumulator, basically]
   2 Stack array 1 [We swap XY or YZ planes with this before copying to assure clean shift] - Must be all zeroes and of size max(NxNy, NyNz, NxNz)
   3 Stack array 2
   4 Shift directions
   5 Coefficient on shift */

  // Get GPU array pointers
  GPUtype srcArray    = gm->gputype.getGPUtype(prhs[0]);
  GPUtype dstArray    = gm->gputype.getGPUtype(prhs[1]);
  GPUtype stackArrayX = gm->gputype.getGPUtype(prhs[2]);
//GPUtype stackArrayY = gm->gputype.getGPUtype(prhs[3]);
  GPUtype stackArrayZ = gm->gputype.getGPUtype(prhs[3]);

  // Get some control variables sorted out
  double *shiftdirs  = mxGetPr(prhs[4]);
  const int *dims    = gm->gputype.getSize(srcArray);

  double alpha       = *mxGetPr(prhs[5]);

  int shifts[3];
  shifts[0] = (int)shiftdirs[0];
  shifts[1] = (int)shiftdirs[1];
  shifts[2] = (int)shiftdirs[2];

  double *cubSrc = (double*)gm->gputype.getGPUptr(srcArray);

  // Remove appropriate YZ plane if any
  double *cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);

  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
  if(shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);

  // Remove appropriate XZ plane if any
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

  // Remove appropriate XY plane if any
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);

  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
  if(shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

  // Decide the amount of offset to acheive desired shift
  int theta = shifts[0] + dims[0]*shifts[1] + dims[0]*dims[1]*shifts[2];
  int Ntot  = dims[0] * dims[1] * dims[2];

  cubDst = (double*)gm->gputype.getGPUptr(dstArray);
  if(theta >= 0) {
    cublasDaxpy(Ntot-theta, alpha, cubSrc,         1, cubDst + theta, 1);
  } else {
    cublasDaxpy(Ntot+theta, alpha, cubSrc - theta, 1, cubDst,         1);
  }

  // Replace the XY plane if it was removed
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayZ);
  if(shifts[2] == -1) cublasDswap(dims[0]*dims[1], cubSrc,                               1, cubDst, 1);
  if(shifts[2] ==  1) cublasDswap(dims[0]*dims[1], cubSrc + dims[0]*dims[1]*(dims[2]-1), 1, cubDst, 1);

  // replace the XZ plane if it was removed
  //stackSwapXZplane(cubSrc, (double*)gm->gputype.getGPUptr(stackArrayY), (int *)dims, shifts);

  // Replace the YZ plane if it was removed
  cubDst = (double*)gm->gputype.getGPUptr(stackArrayX);
  if(shifts[0] == -1) cublasDswap(dims[1]*dims[2], cubSrc,             dims[0], cubDst, 1);
  if(shifts[0] ==  1) cublasDswap(dims[1]*dims[2], cubSrc + dims[0]-1, dims[0], cubDst, 1);
}