Пример #1
0
/*
   Generates Householder elementary reflector H = I - tau v v^T to reduce
     H [ dx0 ] = [ beta ]
       [ dx  ]   [ 0    ]
   with beta = ±norm( [dx0, dx] ) = ±dxnorm[0].
   Stores v over dx; first element of v is 1 and is not stored.
   Stores beta over dx0.
   Stores tau.

   The difference with LAPACK's clarfg is that the norm of dx, and hence beta,
   are computed outside the routine and passed to it in dxnorm (array on the GPU).

   The reflector itself is generated by magma_clarfgx_gpu. Afterwards the
   i-th column of T is filled in:
     - i == 0: tau is copied into T(0,0) and the single element at dx0 is
       overwritten with 1 (MAGMA_C_ONE) from the host.
     - i  > 0: two OpenCL kernels are enqueued on `queue`, first
       "magma_cgemv_kernel3" (i work-groups of BLOCK_SIZE work-items) and then
       "magma_ctrmv_kernel2" (i work-groups of i work-items), which writes to
       T at offset T_offset + i*ldt.

   Every *_offset argument is forwarded to the kernels as a 32-bit int.
   NOTE(review): offsets are presumably counted in elements, not bytes --
   confirm against the kernel sources.

   Returns MAGMA_SUCCESS, or MAGMA_ERR_UNKNOWN if a kernel cannot be found in
   the kernel pool, an argument cannot be set, or a launch cannot be enqueued.
*/
extern "C" magma_err_t
magma_clarfgtx_gpu(int n, magmaFloatComplex_ptr dx0, size_t dx0_offset, magmaFloatComplex_ptr dx, size_t dx_offset,
                   magmaFloatComplex_ptr dtau, size_t dtau_offset, magmaFloat_ptr dxnorm, size_t dxnorm_offset, 
                   magmaFloatComplex_ptr dA, size_t dA_offset, int i, 
                   magmaFloatComplex_ptr V, size_t V_offset, int ldv, magmaFloatComplex_ptr T, size_t T_offset, int ldt, 
                   magmaFloatComplex_ptr work, size_t work_offset, 
                   magma_queue_t queue)
{
   /*  Generate the elementary reflector H(i)  */
   magma_clarfgx_gpu(n, dx0, dx0_offset, dx, dx_offset, dtau, dtau_offset, dxnorm, dxnorm_offset, dA, dA_offset, i, queue);

   if (i==0){
      /* First column of T: T(0,0) = tau, and the element at dx0 becomes 1. */
      magmaFloatComplex tt = MAGMA_C_ONE;
      magmablas_clacpy(MagmaFull, 1, 1, dtau, dtau_offset, 1, T, T_offset+i+i*ldt, 1, queue);
      magma_csetmatrix(1, 1, &tt, 0, 1, dx0, dx0_offset, 1, queue);
      return MAGMA_SUCCESS;
   }

   /* Compute the i-th column of T */

   /* The kernels take their offsets as int (the arguments are registered with
      sizeof(int)). Handing clSetKernelArg the address of a size_t together
      with sizeof(int) copies only the first four bytes of the object, which is
      the wrong half on big-endian hosts. Narrow explicitly so the bytes that
      get copied are the value itself. */
   int V_off    = (int) V_offset;
   int dx0_off  = (int) dx0_offset;
   int work_off = (int) work_offset;
   int dtau_off = (int) dtau_offset;
   int T_off    = (int) T_offset;
   int Ti_off   = (int) (T_offset + i*ldt);   /* start of column i of T */

   cl_int ciErrNum;                // Error code var
   cl_kernel ckKernel = rt->KernelPool["magma_cgemv_kernel3"];     // in clarfbx.cl

   if (!ckKernel)
   {
       printf ("Error: cannot locate kernel in line %d, file %s\n", __LINE__, __FILE__);
       return MAGMA_ERR_UNKNOWN;
   }

   int nn = 0;
   ciErrNum  = clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&n );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&V );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&V_off );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&ldv );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&dx0 );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&dx0_off );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&work );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&work_off );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&dtau );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&dtau_off );
   if (ciErrNum != CL_SUCCESS)
   {
       printf("Error: clSetKernelArg at %d in file %s!\n", __LINE__, __FILE__);
       return MAGMA_ERR_UNKNOWN;
   }

   size_t GlobalWorkSize[1]={0}, LocalWorkSize[1]={0};

   /* i work-groups of BLOCK_SIZE work-items.
      CUDA original: magma_cgemv_kernel3<<< i, BLOCK_SIZE, 0, magma_stream >>>(n, V, ldv, dx0, work, dtau); */
   LocalWorkSize[0] = BLOCK_SIZE;
   GlobalWorkSize[0] = i*LocalWorkSize[0];

   // launch kernel
   ciErrNum = clEnqueueNDRangeKernel(
       queue, ckKernel, 1, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
   if (ciErrNum != CL_SUCCESS)
   {
       printf("Error: clEnqueueNDRangeKernel at %d in file %s \"%s\"\n",
           __LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
       return MAGMA_ERR_UNKNOWN;
   }

   clFlush(queue);

   ckKernel = rt->KernelPool["magma_ctrmv_kernel2"];         // in clarfx.cl

   if (!ckKernel)
   {
       printf ("Error: cannot locate kernel in line %d, file %s\n", __LINE__, __FILE__);
       return MAGMA_ERR_UNKNOWN;
   }

   nn = 0;
   ciErrNum  = clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&T );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&T_off );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&ldt );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&work );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&work_off );
   /* Same buffer T again, this time addressed at the start of column i. */
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&T );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&Ti_off );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(magmaFloatComplex_ptr), (void*)&dtau );
   ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(int), (void*)&dtau_off );
   if (ciErrNum != CL_SUCCESS)
   {
       printf("Error: clSetKernelArg at %d in file %s!\n", __LINE__, __FILE__);
       return MAGMA_ERR_UNKNOWN;
   }

   /* i work-groups of i work-items.
      CUDA original: magma_ctrmv_kernel2<<< i, i, 0, magma_stream >>>( T, ldt, work, T+i*ldt, dtau); */
   LocalWorkSize[0] = i;
   GlobalWorkSize[0] = i*LocalWorkSize[0];

   // launch kernel
   ciErrNum = clEnqueueNDRangeKernel(
       queue, ckKernel, 1, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL);
   if (ciErrNum != CL_SUCCESS)
   {
       printf("Error: clEnqueueNDRangeKernel at %d in file %s \"%s\"\n",
           __LINE__, __FILE__, rt->GetErrorCode(ciErrNum));
       /* size_t must not be printed with %d; cast to a type with a portable specifier. */
       printf("block: %lu,    group: %lu\n",
           (unsigned long) LocalWorkSize[0], (unsigned long) GlobalWorkSize[0]);
       return MAGMA_ERR_UNKNOWN;
   }

   clFlush(queue);

   return MAGMA_SUCCESS;
}
Пример #2
0
extern "C" magma_int_t
magma_cgeqr2x_gpu(magma_int_t *m, magma_int_t *n, magmaFloatComplex *dA,
                  magma_int_t *ldda, magmaFloatComplex *dtau,
                  magmaFloatComplex *dT, magmaFloatComplex *ddA,
                  float *dwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       December 2013

    Purpose
    =======
    Computes a QR factorization of a complex m by n matrix A:  A = Q * R.

    This is the "expert" variant of cgeqr2. Compared with the standard
    routine it takes two extra output arrays, dT and ddA, and it does not
    store A the way LAPACK's cgeqr2 does (see ddA below):

      - dT  receives the triangular n x n factor T of the block reflector
            used in the factorization;
      - ddA receives the diagonal n x n blocks of A, i.e. the diagonal
            submatrices of R.

    This version implements the right-looking QR.

    Arguments
    =========
    m       (input) pointer to INTEGER
            The number of rows of the matrix A.  *m >= 0.

    n       (input) pointer to INTEGER
            The number of columns of the matrix A.  *n >= 0.

    dA      (input/output) COMPLEX array, dimension (LDDA,N)
            On entry, the m by n matrix A.
            On exit, the unitary matrix Q as a product of elementary
            reflectors (see Further Details): the elements below the
            diagonal, together with the array dtau, represent Q. In the
            LAPACK layout the elements on and above the diagonal would hold
            the min(m,n) by n upper trapezoidal matrix R (upper triangular
            if m >= n); here the diagonal block of R is returned in ddA.

    ldda    (input) pointer to INTEGER
            The leading dimension of the array dA.  *ldda >= max(1,*m).

    dtau    (output) COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    dT      (output) COMPLEX array, dimension N x N.
            Stores the triangular N x N factor T of the block reflector
            used in the factorization. The lower triangular part is 0.

    ddA     (output) COMPLEX array, dimension N x N.
            Stores the elements of the upper N x N diagonal block of A.
            LAPACK stores this array in A. There are 0s below the diagonal.

    dwork   (workspace) REAL array.
            The front of the buffer is handed to magmablas_scnrm2_cols as the
            column-norm array; a COMPLEX scratch area, passed on to
            magma_clarfx_gpu, starts 2*N floats into the buffer.
            NOTE(review): the required total length is not visible in this
            routine -- confirm against the callers / the kernels.

    info    (output) pointer to INTEGER
            = 0: successful exit
            < 0: if INFO = -i, the i-th argument had an illegal value
                 (-1: *m < 0, -2: *n < 0, -4: *ldda < max(1,*m)).

    Further Details
    ===============
    The matrix Q is represented as a product of elementary reflectors

       Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).
    =====================================================================    */

    /* Address of element (a_1, a_2) of the column-major matrix dA. */
    #define  da_ref(a_1,a_2) ( dA+(a_2)*(*ldda) + (a_1))

    /* Split the real workspace: norms first, complex scratch after 2*n floats. */
    float             *col_norms = dwork;
    magmaFloatComplex *scratch   = (magmaFloatComplex *)(dwork+2*(*n));

    /* Argument validation. Checks are applied from the last argument to the
       first so that the lowest-numbered offending argument is the one reported. */
    magma_int_t status = 0;
    if (*ldda < max(1,*m))
        status = -4;
    if (*n < 0)
        status = -2;
    if (*m < 0)
        status = -1;

    *info = status;
    if (status != 0) {
        magma_xerbla( __func__, -status );
        return *info;
    }

    /* Compute the norms of the trailing columns */
    const magma_int_t nreflectors = min(*m,*n);
    magmablas_scnrm2_cols(*m, nreflectors, da_ref(0,0), *ldda, col_norms);

    for (magma_int_t j = 0; j < nreflectors; ++j) {
        magmaFloatComplex *diag_elem  = da_ref(j, j);
        magmaFloatComplex *below_diag = da_ref(min(j+1,*m), j);

        /*  Generate elementary reflector H(j) to annihilate A(j+1:m,j) */
        magma_clarfgx_gpu(*m-j, diag_elem, below_diag, dtau+j, col_norms+j,
                          ddA + j + j*(*n), j);

        /* j < min(m,n) <= n here, so this never skips; kept to mirror the
           original guard. */
        if (j >= *n)
            continue;

        /* Apply H(j)' to A(j:m,j+1:n) from the left.
           Note that the matrix pointer handed over is column 0 of row j,
           not column j+1. */
        magma_clarfx_gpu(*m-j, *n-j-1, diag_elem, dtau+j,
                         da_ref(j, 0), *ldda, col_norms+j+1,
                         dT, j, scratch );
    }

    return *info;
} /* magma_cgeqr2x_gpu */