Example #1
int main( int argc, char** argv )
{
    
    magma_int_t n = 15000;
    magma_int_t lda  = n;
    magma_int_t ldx  = lda;
    magma_int_t nrhs = 1000;
    
    printf( "using MAGMA GPU interface\n" );

    GpuSolver g = GpuSolver(n, nrhs);

    // fill some data
    magmaDouble_ptr A=NULL, B=NULL;
    magma_dmalloc_cpu( &A, n*n );
    magma_dmalloc_cpu( &B, n*nrhs );
    for (int i = 0; i < 2; i++) {
        dfill_matrix( n, n, A, n );
        dfill_matrix( n, nrhs, B, n );    
        g.add_data( n, A, B );
    }


    magmaDouble_ptr X=NULL;
    magma_dmalloc_cpu( &X, n*nrhs );

    g.solve( X );

    magma_free_cpu( A );
    magma_free_cpu( B );
    magma_free_cpu( X );

    return 0;
}
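
The example above assumes a helper dfill_matrix that is not shown. A minimal sketch, assuming it just fills an m-by-n column-major array with uniform random values (the signature is taken from the call sites above; requires <stdlib.h>):

// Sketch (assumption): fill a column-major m x n matrix with random values.
void dfill_matrix( magma_int_t m, magma_int_t n, double *A, magma_int_t lda )
{
    for (magma_int_t j = 0; j < n; ++j) {
        for (magma_int_t i = 0; i < m; ++i) {
            A[i + j*lda] = rand() / ((double) RAND_MAX);
        }
    }
}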
Example #2
// init a zero matrix on GPU to store X'*X
GpuSolver::GpuSolver ( int nn, int outs ) {

    magma_init();

    n = nn;
    nrhs = outs;

    magma_int_t i, j;
    magmaDouble_ptr A=NULL, B=NULL;

    magma_dmalloc_cpu( &A, n*n );
    magma_dmalloc_cpu( &B, n*nrhs );
    magma_dmalloc( &dA, n*n );
    magma_dmalloc( &dB, n*nrhs );
    if ( dA == NULL || dB == NULL || A == NULL || B == NULL ) {
        fprintf( stderr, "malloc failed - not enough GPU or system memory?\n" );
        goto cleanup;
    }

    for( j=0; j < n; ++j ) {
        for( i=0; i < n; ++i ) { A[i + j*n] = 0; }
    }
    for( j=0; j < nrhs; ++j ) {
        for( i=0; i < n; ++i ) { B[i + j*n] = 0; }
    }

    magma_dsetmatrix( n, n, A, n, dA, n );
    magma_dsetmatrix( n, nrhs, B, n, dB, n );

  cleanup:
    magma_free_cpu( A );
    magma_free_cpu( B );
}
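
The constructor allocates dA and dB on the device but the matching release is not shown in the source. A minimal destructor sketch, assuming dA and dB are the only device-side members and that the class owns the MAGMA context it initialized:

// Sketch (assumption): release device buffers and shut MAGMA down.
GpuSolver::~GpuSolver() {
    magma_free( dA );
    magma_free( dB );
    magma_finalize();
}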
Example #3
extern "C" void
magma_dlarfxsym(
    magma_int_t N,
    double *A, magma_int_t LDA,
    double *V, double *TAU)
{
    magma_int_t IONE=1;
    double dtmp;
    double Z_ZERO =  MAGMA_D_ZERO;
    //double Z_ONE  =  MAGMA_D_ONE;
    double Z_MONE =  MAGMA_D_NEG_ONE;
    double Z_HALF =  MAGMA_D_HALF;
    //double WORK[N];
    double *WORK;
    magma_dmalloc_cpu( &WORK, N );

    /* apply left and right on A(st:ed,st:ed)*/
    //magma_dlarfxsym(len,A(st,st),LDX,V(st),TAU(st));
    /* X = tau*A*v (dsymv with alpha = tau) */
    blasf77_dsymv("L",&N, TAU, A, &LDA, V, &IONE, &Z_ZERO, WORK, &IONE);
    /* compute dtmp = X'*V */
    dtmp = magma_cblas_ddot(N, WORK, IONE, V, IONE);
    /* compute 1/2 X'*V*tau, negated: dtmp = -1/2*tau*(X'*V) */
    dtmp = -dtmp * Z_HALF * (*TAU);
    /* compute W = X - 1/2*V*(X'*V)*tau = X + dtmp*V */
    /*
    for (j = 0; j < N; j++)
        WORK[j] = WORK[j] + (dtmp*V[j]); */
    blasf77_daxpy(&N, &dtmp, V, &IONE, WORK, &IONE);
    /* performs the symmetric rank 2 operation A := alpha*x*y' + alpha*y*x' + A */
    blasf77_dsyr2("L",&N,&Z_MONE,WORK,&IONE,V,&IONE,A,&LDA);

    magma_free_cpu(WORK);
}
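
In formulas, this routine applies the Householder reflector H = I - tau*v*v' to a symmetric A from both sides via a single symmetric rank-2 update. Writing x and w for the successive contents of WORK:

    x = \tau A v, \qquad
    w = x - \tfrac{1}{2}\,\tau\,(x^{T} v)\,v, \qquad
    A \leftarrow A - w v^{T} - v w^{T} = (I - \tau v v^{T})\,A\,(I - \tau v v^{T})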
Example #4
extern "C" void
magma_dtrdtype3cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    double *A, magma_int_t LDA,
    double *V, double *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    //magma_int_t    J1, J2, J3, i, j;
    magma_int_t    len, LDX;
    //magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    //double conjtmp;
    double *WORK;
    magma_dmalloc_cpu( &WORK, N );


    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    LDX    = LDA-1;
    len    = ed-st+1;

    /* apply left and right on A(st:ed,st:ed)*/
    magma_dlarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos));
    //conjtmp = MAGMA_D_CNJG(*TAU(taupos));
    //lapackf77_dlarfy("L", &len, V(vpos), &IONE,  &(MAGMA_D_CNJG(*TAU(taupos))), A(st,st), &LDX, WORK);
    magma_free_cpu(WORK);
}
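
This kernel, like Examples #7 and #16 below, dereferences A, V, and TAU through indexing macros defined elsewhere in MAGMA's bulge-chasing sources. A sketch of the assumed 1-based definitions:

// Assumed indexing macros (1-based, as in MAGMA's bulge-chasing kernels):
#define A(m,n)   &(A[ (m)-1 + LDA*((n)-1) ])
#define V(m)     &(V[ (m)-1 ])
#define TAU(m)   &(TAU[ (m)-1 ])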
Example #5
 void SVDMatrix_magma(Tensor_core<complex<double>,2>& U, Tensor_core<double,1>& D, Tensor_core<complex<double>,2>& V)
 {
     if( U.rank(0)!=U.rank(1) || U.rank(1)!=D.rank(0) || D.rank(0)!=V.rank(0) || V.rank(0)!=V.rank(1) )
     {
         cout<<"Size is not consistent in SVDMatrix_magma! Only support square matrix."<<endl;
         exit(1);
     }

     magma_int_t m=U.rank(0); magma_int_t n=V.rank(0);
     magma_vec_t jobz(MagmaOverwriteVec); magma_int_t lda=m;
     magmaDoubleComplex* u=nullptr; magma_int_t ldu=1; magma_int_t ldv=n;
     magmaDoubleComplex work_test[1]; magma_int_t lwork=-1;

     double* rwork; magma_int_t* iwork;
     magma_dmalloc_cpu( &rwork, 5*m*m+7*m ); magma_imalloc_cpu(&iwork, 8*m); 
     magma_int_t info;

     magma_zgesdd(jobz, m, n, (magmaDoubleComplex *) U.data(), lda, D.data(), u, ldu, (magmaDoubleComplex *) V.data(), ldv, 
                  work_test, lwork, rwork, iwork, &info);

     lwork=lround( MAGMA_Z_REAL(work_test[0]) );
     magmaDoubleComplex* work; magma_zmalloc_cpu(&work, lwork);

     magma_zgesdd(jobz, m, n, (magmaDoubleComplex *) U.data(), lda, D.data(), u, ldu, (magmaDoubleComplex *) V.data(), ldv, 
                  work,      lwork, rwork, iwork, &info);

     magma_free_cpu(work); magma_free_cpu(rwork); magma_free_cpu(iwork);
     if(info!=0)
     {
         cout<<"SVDMatrix_magma is not suceesful, info= "<<info<<endl;
         exit(1);
     }
 }
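
The example above uses the standard two-phase LAPACK workspace query: a first call with lwork = -1 only writes the optimal workspace size into the first entry of the work array, and the real call runs after allocation. The same pattern in isolation, sketched here with LAPACK's dgeqrf through MAGMA's Fortran wrappers:

// Two-phase workspace query, sketched with lapackf77_dgeqrf.
magma_int_t m = 100, n = 50, lwork = -1, info;
double *A, *tau, work_query[1];
magma_dmalloc_cpu( &A, m*n );
magma_dmalloc_cpu( &tau, n );
/* ... fill A with data ... */
lapackf77_dgeqrf( &m, &n, A, &m, tau, work_query, &lwork, &info );  // query size
lwork = (magma_int_t) work_query[0];
double *work;
magma_dmalloc_cpu( &work, lwork );
lapackf77_dgeqrf( &m, &n, A, &m, tau, work, &lwork, &info );        // factorize
magma_free_cpu( work );  magma_free_cpu( tau );  magma_free_cpu( A );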
Example #6
magma_int_t magma_dnan_inf_gpu(
    magma_uplo_t uplo, magma_int_t m, magma_int_t n,
    magmaDouble_const_ptr dA, magma_int_t ldda,
    magma_int_t *cnt_nan,
    magma_int_t *cnt_inf )
{
    magma_int_t info = 0;
    if ( uplo != MagmaLower && uplo != MagmaUpper && uplo != MagmaFull )
        info = -1;
    else if ( m < 0 )
        info = -2;
    else if ( n < 0 )
        info = -3;
    else if ( ldda < max(1,m) )
        info = -5;

    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magma_int_t lda = m;
    double* A;
    magma_dmalloc_cpu( &A, lda*n );
    magma_dgetmatrix( m, n, dA, ldda, A, lda );

    magma_int_t cnt = magma_dnan_inf( uplo, m, n, A, lda, cnt_nan, cnt_inf );

    magma_free_cpu( A );
    return cnt;
}
Example #7
extern "C" void magma_dtrdtype1cbHLsym_withQ(magma_int_t N, magma_int_t NB, double *A, magma_int_t LDA, double *V, double *TAU, magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz) {
  //magma_int_t    J1, J2, J3, i, j;
  magma_int_t    len, LDX;
  magma_int_t    IONE=1;
  magma_int_t    blkid, vpos, taupos, tpos;
  //double conjtmp;
  double Z_ONE  =  MAGMA_D_ONE;
  double *WORK;
  magma_dmalloc_cpu( &WORK, N );


  findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
  //printf("voici vpos %d taupos %d  tpos %d  blkid %d \n", vpos, taupos, tpos, blkid);
  LDX     = LDA-1;
  len     = ed-st+1;
  *V(vpos)  = Z_ONE;
  memcpy(V(vpos+1), A(st+1, st-1), (len-1)*sizeof(double));
  memset(A(st+1, st-1), 0, (len-1)*sizeof(double));
  /* Eliminate the col  at st-1 */
  lapackf77_dlarfg( &len, A(st, st-1), V(vpos+1), &IONE, TAU(taupos) );
  /* apply left and right on A(st:ed,st:ed)*/
  magma_dlarfxsym(len,A(st,st),LDX,V(vpos),TAU(taupos));
  //conjtmp = MAGMA_D_CNJG(*TAU(taupos));
  //lapackf77_dlarfy("L", &len, V(vpos), &IONE, &conjtmp, A(st,st), &LDX, WORK); //&(MAGMA_D_CNJG(*TAU(taupos)))
  magma_free_cpu(WORK);
}
Example #8
/*-------------------------------------------------------------------
 * Check the orthogonality of Q
 */
static magma_int_t check_orthogonality(magma_int_t M, magma_int_t N, double *Q, magma_int_t LDQ, double eps)
{
    double  done  =  1.0;
    double  mdone = -1.0;
    double c_zero    = MAGMA_D_ZERO;
    double c_one     = MAGMA_D_ONE;
    double  normQ, result;
    magma_int_t     info_ortho;
    magma_int_t     minMN = min(M, N);
    double *work;
    magma_dmalloc_cpu( &work, minMN );

    /* Build the identity matrix */
    double *Id;
    magma_dmalloc_cpu( &Id, minMN*minMN );
    lapackf77_dlaset("A", &minMN, &minMN, &c_zero, &c_one, Id, &minMN);

    /* Perform Id - Q'Q */
    if (M >= N)
        blasf77_dsyrk("U", "C", &N, &M, &done, Q, &LDQ, &mdone, Id, &N);
    else
        blasf77_dsyrk("U", "N", &M, &N, &done, Q, &LDQ, &mdone, Id, &M);

    normQ = lapackf77_dlansy("I", "U", &minMN, Id, &minMN, work);

    result = normQ / (minMN * eps);
    printf( "  %12.2e", result*eps );
    //printf(" ======================================================\n");
    //printf(" ||Id-Q'*Q||_oo / (minMN*eps)          : %15.3E \n",  result );
    //printf(" ======================================================\n");

    if ( isnan(result) || isinf(result) || (result > 60.0) ) {
        //printf("-- Orthogonality is suspicious ! \n");
        info_ortho=1;
    }
    else {
        //printf("-- Orthogonality is CORRECT ! \n");
        info_ortho=0;
    }
    magma_free_cpu(work);
    magma_free_cpu(Id);
    return info_ortho;
}
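
In formulas: the dsyrk call overwrites the identity with Q^T Q - I (whose norm equals that of I - Q^T Q), so the reported quantity is

    \text{result} = \frac{\| I - Q^{T} Q \|_{\infty}}{\min(M,N)\,\varepsilon}

and Q is flagged as suspicious when result is NaN, Inf, or exceeds 60.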
Example #9
extern "C" magma_int_t
magma_dvinit(
    magma_d_matrix *x,
    magma_location_t mem_loc,
    magma_int_t num_rows,
    magma_int_t num_cols,
    double values,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    
    x->val = NULL;
    x->diag = NULL;
    x->row = NULL;
    x->rowidx = NULL;
    x->col = NULL;
    x->list = NULL;
    x->blockinfo = NULL;
    x->dval = NULL;
    x->ddiag = NULL;
    x->drow = NULL;
    x->drowidx = NULL;
    x->dcol = NULL;
    x->dlist = NULL;
    x->storage_type = Magma_DENSE;
    x->memory_location = mem_loc;
    x->sym = Magma_GENERAL;
    x->diagorder_type = Magma_VALUE;
    x->fill_mode = MagmaFull;
    x->num_rows = num_rows;
    x->num_cols = num_cols;
    x->nnz = num_rows*num_cols;
    x->max_nnz_row = num_cols;
    x->diameter = 0;
    x->blocksize = 1;
    x->numblocks = 1;
    x->alignment = 1;
    x->major = MagmaColMajor;
    x->ld = num_rows;
    if ( mem_loc == Magma_CPU ) {
        CHECK( magma_dmalloc_cpu( &x->val, x->nnz ));
        for( magma_int_t i=0; i<x->nnz; i++) {
             x->val[i] = values;
        }
    }
    else if ( mem_loc == Magma_DEV ) {
        CHECK( magma_dmalloc( &x->val, x->nnz ));
        magmablas_dlaset( MagmaFull, x->num_rows, x->num_cols, values, values, x->val, x->num_rows, queue );
    }
    
cleanup:
    return info; 
}
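
A hypothetical call, assuming n and an existing MAGMA queue are defined by the caller: create a dense n x 1 vector of ones on the device and release it again.

// Sketch (assumption): device vector of ones; x.val holds the device
// pointer in this version of the struct.
magma_d_matrix x = {Magma_CSR};
magma_dvinit( &x, Magma_DEV, n, 1, MAGMA_D_ONE, queue );
/* ... use x ... */
magma_dmfree( &x, queue );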
Example #10
magma_int_t
magma_dmLdiagadd(
    magma_d_matrix *L,
    magma_queue_t queue )
{
    magma_int_t info = 0;

    magma_d_matrix LL={Magma_CSR};

    if( L->row[1]==1 ){        // lower triangular with unit diagonal
        //printf("L lower triangular.\n");
        LL.diagorder_type = Magma_UNITY;
        CHECK( magma_dmconvert( *L, &LL, Magma_CSR, Magma_CSRL, queue ));
    }
    else if( L->row[1]==0 ){ // strictly lower triangular
        //printf("L strictly lower triangular.\n");
        CHECK( magma_dmtransfer( *L, &LL, Magma_CPU, Magma_CPU, queue ));
        magma_free_cpu( LL.col );
        magma_free_cpu( LL.val );
        LL.nnz = L->nnz+L->num_rows;
        CHECK( magma_dmalloc_cpu( &LL.val, LL.nnz ));
        CHECK( magma_index_malloc_cpu( &LL.col, LL.nnz ));
        magma_int_t z=0;
        for( magma_int_t i=0; i<L->num_rows; i++){
            LL.row[i] = z;
            for( magma_int_t j=L->row[i]; j<L->row[i+1]; j++){
                LL.val[z] = L->val[j];
                LL.col[z] = L->col[j];
                z++;
            }
            // add unit diagonal
            LL.val[z] = MAGMA_D_MAKE(1.0, 0.0);
            LL.col[z] = i;
            z++;
        }
        LL.row[LL.num_rows] = z;
        LL.nnz = z;
    }
    else{
        printf("error: L neither lower nor strictly lower triangular!\n");
    }
    magma_dmfree( L, queue );
    CHECK( magma_dmtransfer(LL, L, Magma_CPU, Magma_CPU, queue ));

cleanup:
    if( info != 0 ){
        magma_dmfree( L, queue );
    }
    magma_dmfree( &LL, queue );
    return info;
}
Example #11
void magma_dprint_gpu( magma_int_t m, magma_int_t n, const double *dA, magma_int_t ldda )
{
    if ( magma_is_devptr( dA ) == 0 ) {
        fprintf( stderr, "ERROR: dprint_gpu called with host pointer.\n" );
        exit(1);
    }
    
    magma_int_t lda = m;
    double* A;
    magma_dmalloc_cpu( &A, lda*n );
    magma_dgetmatrix( m, n, dA, ldda, A, lda );
    magma_dprint( m, n, A, lda );
    magma_free_cpu( A );
}
Example #12
extern "C" magma_int_t
magma_deigensolverinfo_init(
    magma_d_solver_par *solver_par,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    double *initial_guess=NULL;
    solver_par->eigenvectors = NULL;
    solver_par->eigenvalues = NULL;
    if ( solver_par->solver == Magma_LOBPCG ) {
        CHECK( magma_dmalloc_cpu( &solver_par->eigenvalues ,
                                3*solver_par->num_eigenvalues ));
        // setup initial guess EV using lapack
        // then copy to GPU
        magma_int_t ev = solver_par->num_eigenvalues * solver_par->ev_length;

        CHECK( magma_dmalloc_cpu( &initial_guess, ev ));
        CHECK( magma_dmalloc( &solver_par->eigenvectors, ev ));
        magma_int_t ISEED[4] = {0,0,0,1}, ione = 1;
        lapackf77_dlarnv( &ione, ISEED, &ev, initial_guess );

        magma_dsetmatrix( solver_par->ev_length, solver_par->num_eigenvalues,
            initial_guess, solver_par->ev_length, solver_par->eigenvectors,
                                                    solver_par->ev_length );
    } else {
        solver_par->eigenvectors = NULL;
        solver_par->eigenvalues = NULL;
    }

cleanup:
    if( info != 0 ){
        magma_free( solver_par->eigenvectors );
        magma_free( solver_par->eigenvalues );
    }
    magma_free_cpu( initial_guess );
    return info;
}
Example #13
magma_int_t
magma_dvread(
    magma_d_matrix *x,
    magma_int_t length,
    char * filename,
    magma_queue_t queue )
{
    magma_int_t info = 0;
    
    magma_int_t nnz=0, i=0;
    FILE *fid;
    
    x->memory_location = Magma_CPU;
    x->storage_type = Magma_DENSE;
    x->num_rows = length;
    x->num_cols = 1;
    x->major = MagmaColMajor;
    CHECK( magma_dmalloc_cpu( &x->val, length ));
    
    fid = fopen(filename, "r");
    if ( fid == NULL ) {
        printf("error: could not open file %s.\n", filename);
        info = -1;
        goto cleanup;
    }
    
    #define REAL
    
    while( i<length )  // read exactly 'length' entries
    {
        double VAL1;
        double VAL;
        
        #ifdef COMPLEX
            double VAL2;
            fscanf(fid, " %lf %lf \n", &VAL1, &VAL2);
            VAL = MAGMA_D_MAKE(VAL1, VAL2);
        #else
            fscanf(fid, " %lf \n", &VAL1);
            VAL = MAGMA_D_MAKE(VAL1, 0.0);
        #endif
        
        if ( VAL != MAGMA_D_ZERO )
            nnz++;
        x->val[i] = VAL;
        i++;
    }
    fclose(fid);
    
    x->nnz = nnz;
    
cleanup:
    return info;
}
Example #14
double *generate_sym_matrix(int m) {
    double *a;
    magma_int_t i, j;
    magma_int_t mm = m*m;
    magma_err_t err;
    err = magma_dmalloc_cpu( &a, mm );

    // fill with uniform random numbers, then symmetrize and shift the
    // diagonal by m to make the matrix diagonally dominant (hence SPD)
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = {0, 0, 0, 1};
    lapackf77_dlarnv( &ione, ISEED, &mm, a );
    for (i = 0; i < m; i++) {
        MAGMA_D_SET2REAL( a[i*m+i], MAGMA_D_REAL( a[i*m+i] ) + 1.*m );
        for (j = 0; j < i; j++)
            a[i*m+j] = a[j*m+i];
    }

    return a;
}
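
Adding 1.*m to each diagonal entry makes the symmetrized random matrix strictly diagonally dominant, hence positive definite. A small sanity check, sketched under the assumption that MAGMA's LAPACK wrappers are available:

// Sketch: dpotrf succeeds (info == 0) exactly when the matrix is SPD.
magma_int_t msmall = 100, info;
double *a = generate_sym_matrix( (int) msmall );
lapackf77_dpotrf( "L", &msmall, a, &msmall, &info );
printf( "dpotrf info = %d (0 means SPD)\n", (int) info );
magma_free_cpu( a );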
Example #15
double *cholesky_gpu(double *ml, int m) {
    magma_int_t mm = m*m;
    magma_int_t info;
    double *a;
    double *d_a;
    magma_err_t err;
    err = magma_dmalloc_cpu( &a, mm );
    err = magma_dmalloc( &d_a, mm );

    magma_dsetmatrix( m, m, ml, m, d_a, m );

    // Cholesky factorization, then the inverse from the factor;
    // dpotri('L') fills only the lower triangle of the inverse
    magma_dpotrf_gpu('L', m, d_a, m, &info);
    magma_dpotri_gpu('L', m, d_a, m, &info);

    magma_dgetmatrix( m, m, d_a, m, a, m );
    magma_free( d_a );

    return a;
}
Example #16
extern "C" void
magma_dtrdtype2cbHLsym_withQ(
    magma_int_t N, magma_int_t NB,
    double *A, magma_int_t LDA,
    double *V, double *TAU,
    magma_int_t st, magma_int_t ed, magma_int_t sweep, magma_int_t Vblksiz)
{
    magma_int_t    J1, J2, len, lem, LDX;
    //magma_int_t    i, j;
    magma_int_t    IONE=1;
    magma_int_t    blkid, vpos, taupos, tpos;
    double conjtmp;
    double Z_ONE  =  MAGMA_D_ONE;
    //double WORK[NB];
    double *WORK;
    magma_dmalloc_cpu( &WORK, NB );


    findVTpos(N,NB,Vblksiz,sweep-1,st-1, &vpos, &taupos, &tpos, &blkid);
    LDX    = LDA-1;
    J1     = ed+1;
    J2     = min(ed+NB,N);
    len    = ed-st+1;
    lem    = J2-J1+1;
    if (lem > 0) {
        /* apply the remaining right update coming from the top block */
        lapackf77_dlarfx("R", &lem, &len, V(vpos), TAU(taupos), A(J1, st), &LDX, WORK);
    }
    if (lem > 1) {
        findVTpos(N,NB,Vblksiz,sweep-1,J1-1, &vpos, &taupos, &tpos, &blkid);
        /* remove the first column of the created bulge */
        *V(vpos)  = Z_ONE;
        memcpy(V(vpos+1), A(J1+1, st), (lem-1)*sizeof(double));
        memset(A(J1+1, st),0,(lem-1)*sizeof(double));
        /* Eliminate the col at st */
        lapackf77_dlarfg( &lem, A(J1, st), V(vpos+1), &IONE, TAU(taupos) );
        /* apply left on A(J1:J2,st+1:ed) */
        len = len-1; /* because we start at col st+1 instead of st; col st is the col that has been removed */
        conjtmp = MAGMA_D_CNJG(*TAU(taupos));
        lapackf77_dlarfx("L", &lem, &len, V(vpos),  &conjtmp, A(J1, st+1), &LDX, WORK);
    }
    magma_free_cpu(WORK);
}
Example #17
 void eigen_magma(Tensor_core<double,2>& A, Tensor_core<double,1>& W, char JOBZ, char UPLO)
 {
     if( A.rank(0) != A.rank(1) ) {cout<<"Input matrix for eigen is not square!"<<endl; exit(1);}
     if( A.rank(0) != W.rank(0) ) {cout<<"Size of W is not consistent with A!"<<endl; exit(1);}

     magma_vec_t jobz = magma_vec_const(JOBZ); magma_uplo_t uplo = magma_uplo_const(UPLO);
     magma_int_t N=A.rank(0); magma_int_t info;

     double work_test[1]; magma_int_t iwork_test[1];
     magma_int_t lwork=-1; magma_int_t liwork=-1;
     magma_dsyevd( jobz, uplo, N, A.data(), N, W.data(), work_test, lwork, iwork_test, liwork, &info );

     lwork=lround(work_test[0]); liwork=iwork_test[0];
     double* work; magma_int_t* iwork;
     magma_dmalloc_cpu(&work, lwork); magma_imalloc_cpu(&iwork, liwork);
     magma_dsyevd( jobz, uplo, N, A.data(), N, W.data(), work, lwork, iwork, liwork, &info );
     magma_free_cpu(work); magma_free_cpu(iwork);

     if(info!=0) {cout<<"Dsyevd failed: info= "<< info<<endl; exit(1);}
 }
Example #18
int main(int argc, char** argv) {
    magma_init();
    magma_timestr_t start, end;
    double gpu_time;
    double *c;
    int dim[] = {20000,30000,40000};
    int i, n;
    n = sizeof(dim) / sizeof(dim[0]);

    for (i = 0; i < n; i++) {
        magma_int_t m = dim[i];

        // generate a random symmetric, positive definite matrix
        double *ml = generate_sym_matrix(m);

        start = get_current_time();

        // invert the m x m symmetric positive definite matrix via the
        // Cholesky decomposition (GPU factorization, CPU interface)
        c = cholesky_gpu(ml, m);

        end = get_current_time();

        gpu_time = GetTimerValue(start,end)/1e3;

        printf("gpu time for %dx%d: %7.5f sec\n", (int) m, (int) m, gpu_time);

        // mirror the lower triangle of the inverse into the upper triangle
        copy_upper_diag(c, m);

        magma_free_cpu(ml);
        magma_free_cpu(c);
    }

    magma_finalize();
    return 0;
}
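
copy_upper_diag is not shown in the source. A minimal sketch, assuming it mirrors the lower triangle of the column-major inverse (which is all dpotri computed) into the upper triangle:

// Sketch (assumption): a(j,i) = a(i,j) for i > j, column-major storage.
void copy_upper_diag( double *a, int m )
{
    for (int j = 0; j < m; j++)
        for (int i = j+1; i < m; i++)
            a[j + i*m] = a[i + j*m];
}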
Example #19
/**
    Purpose
    -------
    DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by DGEQRF.

    This version recomputes the T matrices on the CPU and sends them to the GPU.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       DOUBLE_PRECISION array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by DGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF_GPU.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument has an illegal value

    @ingroup magma_dgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dorgqr2(magma_int_t m, magma_int_t n, magma_int_t k,
              double *A, magma_int_t lda,
              double *tau,
              magma_int_t *info)
{
#define  A(i,j) ( A + (i) + (j)*lda )
#define dA(i,j) (dA + (i) + (j)*ldda)

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    magma_int_t nb = magma_get_dgeqrf_nb(min(m, n));

    magma_int_t  m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldda;
    magma_int_t i, ib, ki, kk;  //, iinfo;
    magma_int_t lddwork;
    double *dA, *dV, *dW, *dT, *T;
    double *work;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }

    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate GPU work space
    // ldda*n     for matrix dA
    // ldda*nb    for dV
    // lddwork*nb for dW larfb workspace
    ldda    = ((m + 31) / 32) * 32;
    lddwork = ((n + 31) / 32) * 32;
    if (MAGMA_SUCCESS != magma_dmalloc( &dA, ldda*n + ldda*nb + lddwork*nb + nb*nb)) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    dV = dA + ldda*n;
    dW = dA + ldda*n + ldda*nb;
    dT = dA + ldda*n + ldda*nb + lddwork*nb;

    // Allocate CPU work space
    lwork = (n+m+nb) * nb;
    magma_dmalloc_cpu( &work, lwork );

    if (work == NULL) {
        magma_free( dA );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    T = work;
    double *V = work + (n+nb)*nb;

    magma_queue_t stream;
    magma_queue_create( &stream );

    // Use unblocked code for the last or only block.
    if (kk < n) {
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        /*
            lapackf77_dorgqr( &m_kk, &n_kk, &k_kk,
                              A(kk, kk), &lda,
                              &tau[kk], work, &lwork, &iinfo );
        */
        lapackf77_dlacpy( MagmaUpperLowerStr, &m_kk, &k_kk, A(kk,kk), &lda, V, &m_kk);
        lapackf77_dlaset( MagmaUpperLowerStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda );

        lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          V, &m_kk, &tau[kk], work, &k_kk);
        lapackf77_dlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          V, &m_kk, work, &k_kk, A(kk, kk), &lda, work+k_kk*k_kk, &n_kk );
        
        if (kk > 0) {
            magma_dsetmatrix( m_kk, n_kk,
                              A(kk, kk),  lda,
                              dA(kk, kk), ldda );
        
            // Set A(1:kk,kk+1:n) to zero.
            magmablas_dlaset( MagmaFull, kk, n - kk, c_zero, c_zero, dA(0, kk), ldda );
        }
    }

    if (kk > 0) {
        // Use blocked code
        // stream: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        magmablasSetKernelStream( stream );
        
        for (i = ki; i >= 0; i -= nb) {
            ib = min(nb, k - i);

            // Send current panel to the GPU
            mi = m - i;
            lapackf77_dlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            magma_dsetmatrix_async( mi, ib,
                                    A(i, i), lda,
                                    dV,      ldda, stream );
            lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &mi, &ib,
                              A(i,i), &lda, &tau[i], T, &nb);
            magma_dsetmatrix_async( ib, ib,
                                    T, nb,
                                    dT, nb, stream );

            // set panel to identity
            magmablas_dlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(0, i), ldda );
            magmablas_dlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(i, i), ldda );
            
            magma_queue_sync( stream );
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                  mi, n-i, ib,
                                  dV,       ldda, dT, nb,
                                  dA(i, i), ldda, dW, lddwork );
            }
        }
    
        // copy result back to CPU
        magma_dgetmatrix( m, n,
                          dA(0, 0), ldda, A(0, 0), lda);
    }

    magmablasSetKernelStream( NULL );
    magma_queue_destroy( stream );
    magma_free( dA );
    magma_free_cpu( work );

    return *info;
} /* magma_dorgqr */
Example #20
File: dorgqr.cpp Project: kjbartel/clmagma
extern "C" magma_int_t
magma_dorgqr(
    magma_int_t m, magma_int_t n, magma_int_t k,
    double *a, magma_int_t lda,
    double *tau, magmaDouble_ptr dT, size_t dT_offset,
    magma_int_t nb,
    magma_queue_t queue,
    magma_int_t *info )
{
/*  -- clMAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date November 2014

    Purpose
    =======
    DORGQR generates an M-by-N DOUBLE_PRECISION matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

          Q  =  H(1) H(2) . . . H(k)

    as returned by DGEQRF.

    Arguments
    =========
    M       (input) INTEGER
            The number of rows of the matrix Q. M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    K       (input) INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    A       (input/output) DOUBLE_PRECISION array A, dimension (LDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by DGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    LDA     (input) INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    TAU     (input) DOUBLE_PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF_GPU.

    DT      (input) DOUBLE_PRECISION array on the GPU device.
            DT contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_dgeqrf_gpu.

    NB      (input) INTEGER
            This is the block size used in DGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in DT.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument has an illegal value
    =====================================================================    */

    #define  a_ref(i,j)     ( a + (j)*lda  + (i))
    #define da_ref(i,j)     da, (da_offset + (j)*ldda + (i))
    #define t_ref(a_1)      dT, (dT_offset + (a_1)*nb)

    double c_zero = MAGMA_D_ZERO;
    
    magma_int_t  i__1, i__2, i__3;
    magma_int_t lwork, ldda;
    magma_int_t i, ib, ki, kk, iinfo;
    magma_int_t lddwork = min(m, n);
    double *work;
    magmaDouble_ptr da, dwork;
    size_t da_offset, dwork_offset;
    magma_event_t event = NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0)
      return *info;

    /* Allocate GPU work space */
    ldda = ((m+31)/32)*32;
    lddwork = ((lddwork+31)/32)*32;
    if (MAGMA_SUCCESS != magma_dmalloc( &da, ((n)*ldda + nb*lddwork ) )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    da_offset = 0;
    dwork = da;
    dwork_offset = da_offset + (n)*ldda;

    /* Allocate CPU work space */
    lwork = n * nb;
    magma_dmalloc_cpu( &work, lwork );
    if( work == NULL ) {
        magma_free( da );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }

    if ( (nb > 1) && (nb < k) )
      {
        /*  Use blocked code after the last block.
            The first kk columns are handled by the block method. */
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);

        /* Set A(1:kk,kk+1:n) to zero. */
        magmablas_dlaset(MagmaFull, kk, n-kk, c_zero, c_zero, da_ref(0,kk), ldda, queue);
      }
    else
      kk = 0;

    /* Use unblocked code for the last or only block. */
    if (kk < n)
      {
        i__1 = m - kk;
        i__2 = n - kk;
        i__3 = k - kk;
        lapackf77_dorgqr(&i__1, &i__2, &i__3,
                         a_ref(kk, kk), &lda,
                         &tau[kk], work, &lwork, &iinfo);
        
        magma_dsetmatrix(i__1, i__2, a_ref(kk, kk), lda, da_ref(kk, kk), ldda, queue);
      }

    if (kk > 0)
      {
        /* Use blocked code */
        for (i = ki; i >= 0; i-=nb)
          {
            ib = min(nb, k - i);

            /* Send the current panel to the GPU */
            i__2 = m - i;
            dpanel_to_q(MagmaUpper, ib, a_ref(i,i), lda, work);
            magma_dsetmatrix(i__2, ib, a_ref(i, i), lda, da_ref(i, i), ldda, queue);
                             
            if (i + ib < n)
              {
                /* Apply H to A(i:m,i+ib:n) from the left */
                i__3 = n - i - ib;
                magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                  i__2, i__3, ib,
                                  da_ref(i, i   ), ldda, t_ref(i),      nb,
                                  da_ref(i, i+ib), ldda,    dwork, dwork_offset, lddwork, queue);
              }

            /* Apply H to rows i:m of current block on the CPU */
            lapackf77_dorgqr(&i__2, &ib, &ib,
                             a_ref(i, i), &lda,
                             &tau[i], work, &lwork, &iinfo);
            magma_dsetmatrix_async( i__2, ib,
                                    a_ref(i,i), lda,
                                    da_ref(i,i), ldda, queue, &event );

            /* Set rows 1:i-1 of current block to zero */
            i__2 = i + ib;
            magmablas_dlaset(MagmaFull, i, i__2 - i, c_zero, c_zero, da_ref(0,i), ldda, queue);
          }
      }
    
    magma_dgetmatrix(m, n, da_ref(0, 0), ldda, a_ref(0, 0), lda, queue);
    
    //cudaStreamDestroy(stream);
    magma_free( da );
    magma_free_cpu(work);

    return *info;
} /* magma_dorgqr */
Example #21
/* ////////////////////////////////////////////////////////////////////////////
   -- Debugging file
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    magma_d_solver_par solver_par;
    magma_d_preconditioner precond_par;
    solver_par.epsilon = 10e-16;
    solver_par.maxiter = 1000;
    solver_par.verbose = 0;
    precond_par.solver = Magma_JACOBI;

    magma_dsolverinfo_init( &solver_par, &precond_par );
    
    double one = MAGMA_D_MAKE(1.0, 0.0);
    double zero = MAGMA_D_MAKE(0.0, 0.0);

    magma_d_sparse_matrix A, B, B_d;
    magma_d_vector x, b;

    // generate matrix of desired structure and size
    magma_int_t n=10;   // size is n*n
    magma_int_t nn = n*n;
    magma_int_t offdiags = 2;
    magma_index_t *diag_offset;
    double *diag_vals;
    magma_dmalloc_cpu( &diag_vals, offdiags+1 );
    magma_index_malloc_cpu( &diag_offset, offdiags+1 );
    diag_offset[0] = 0;
    diag_offset[1] = 1;
    diag_offset[2] = n;
    diag_vals[0] = MAGMA_D_MAKE( 4.0, 0.0 );
    diag_vals[1] = MAGMA_D_MAKE( -1.0, 0.0 );
    diag_vals[2] = MAGMA_D_MAKE( -1.0, 0.0 );
    magma_dmgenerator( nn, offdiags, diag_offset, diag_vals, &A );

    // convert matrix into desired format
    B.storage_type = Magma_SELLC;
    B.blocksize = 8;
    B.alignment = 8;
    // scale matrix
    magma_dmscale( &A, Magma_UNITDIAG );

    magma_d_mconvert( A, &B, Magma_CSR, B.storage_type );
    magma_d_mtransfer( B, &B_d, Magma_CPU, Magma_DEV );

    // vectors and initial guess
    magma_d_vinit( &b, Magma_DEV, A.num_cols, one );
    magma_d_vinit( &x, Magma_DEV, A.num_cols, one );
    magma_d_spmv( one, B_d, x, zero, b );                 //  b = A x
    magma_d_vfree(&x);
    magma_d_vinit( &x, Magma_DEV, A.num_cols, zero );

    // solver
    magma_dpcg( B_d, b, &x, &solver_par, &precond_par );

    // solverinfo
    magma_dsolverinfo( &solver_par, &precond_par );

    magma_dsolverinfo_free( &solver_par, &precond_par );

    magma_d_mfree(&B_d);
    magma_d_mfree(&B);
    magma_d_mfree(&A); 
    magma_d_vfree(&x);
    magma_d_vfree(&b);

    TESTING_FINALIZE();
    return 0;
}
Example #22
extern "C" magma_int_t
magma_dlobpcg( magma_d_sparse_matrix A, magma_d_solver_par *solver_par ){


#define  residualNorms(i,iter)  ( residualNorms + (i) + (iter)*n )
#define magmablas_swap(x, y)    { pointer = x; x = y; y = pointer; }
#define hresidualNorms(i,iter)  (hresidualNorms + (i) + (iter)*n )

#define gramA(    m, n)   (gramA     + (m) + (n)*ldgram)
#define gramB(    m, n)   (gramB     + (m) + (n)*ldgram)
#define gevectors(m, n)   (gevectors + (m) + (n)*ldgram) 
#define h_gramB(  m, n)   (h_gramB   + (m) + (n)*ldgram)

#define magma_d_bspmv_tuned(m, n, alpha, A, X, beta, AX)       {        \
            magmablas_dtranspose( m, n, X, m, blockW, n );        	\
            magma_d_vector x, ax;                                       \
            x.memory_location = Magma_DEV;  x.num_rows = m*n;  x.nnz = m*n;  x.val = blockW; \
            ax.memory_location= Magma_DEV; ax.num_rows = m*n; ax.nnz = m*n; ax.val = AX;     \
            magma_d_spmv(alpha, A, x, beta, ax );                           \
            magmablas_dtranspose( n, m, blockW, n, X, m );            		\
}




//**************************************************************

    // Memory allocation for the eigenvectors, eigenvalues, and workspace
    solver_par->solver = Magma_LOBPCG;
    magma_int_t m = A.num_rows;
    magma_int_t n =(solver_par->num_eigenvalues);
    double *blockX = solver_par->eigenvectors;
    double *evalues = solver_par->eigenvalues;


    double *dwork, *hwork;
    double *blockP, *blockAP, *blockR, *blockAR, *blockAX, *blockW;
    double *gramA, *gramB, *gramM;
    double *gevectors, *h_gramB;

    double *pointer, *origX = blockX;
    double *eval_gpu;

    magma_int_t lwork = max( 2*n+n*magma_get_dsytrd_nb(n),
                                            1 + 6*3*n + 2* 3*n* 3*n);

    magma_dmalloc_pinned( &hwork   ,        lwork );
    magma_dmalloc(        &blockAX   ,        m*n );
    magma_dmalloc(        &blockAR   ,        m*n );
    magma_dmalloc(        &blockAP   ,        m*n );
    magma_dmalloc(        &blockR    ,        m*n );
    magma_dmalloc(        &blockP    ,        m*n );
    magma_dmalloc(        &blockW    ,        m*n );
    magma_dmalloc(        &dwork     ,        m*n );            
    magma_dmalloc(        &eval_gpu  ,        3*n );




//**********************************************************+

    magma_int_t verbosity = 1;
    magma_int_t *iwork, liwork = 15*n+9;

    // === Set solver parameters ===
    double residualTolerance  = solver_par->epsilon;
    magma_int_t maxIterations = solver_par->maxiter;

    // === Set some constants & defaults ===
    double c_one = MAGMA_D_ONE, c_zero = MAGMA_D_ZERO;

    double *residualNorms, *condestGhistory, condestG;
    double *gevalues;
    magma_int_t *activeMask;

    // === Check some parameters for possible quick exit ===
    solver_par->info = 0;
    if (m < 2)
        solver_par->info = -1;
    else if (n > m)
        solver_par->info = -2;

    if (solver_par->info != 0) {
        magma_xerbla( __func__, -(solver_par->info) );
        return solver_par->info;
    }
    magma_int_t *info = &(solver_par->info); // local info variable;

    // === Allocate GPU memory for the residual norms' history ===
    magma_dmalloc(&residualNorms, (maxIterations+1) * n);
    magma_malloc( (void **)&activeMask, (n+1) * sizeof(magma_int_t) );

    // === Allocate CPU work space ===
    magma_dmalloc_cpu(&condestGhistory, maxIterations+1);
    magma_dmalloc_cpu(&gevalues, 3 * n);
    magma_malloc_cpu((void **)&iwork, liwork * sizeof(magma_int_t));

    double *hW;
    magma_dmalloc_pinned(&hW, n*n);
    magma_dmalloc_pinned(&gevectors, 9*n*n); 
    magma_dmalloc_pinned(&h_gramB  , 9*n*n);

    // === Allocate GPU workspace ===
    magma_dmalloc(&gramM, n * n);
    magma_dmalloc(&gramA, 9 * n * n);
    magma_dmalloc(&gramB, 9 * n * n);

    #if defined(PRECISION_z) || defined(PRECISION_c)
    double *rwork;
    magma_int_t lrwork = 1 + 5*(3*n) + 2*(3*n)*(3*n);

    magma_dmalloc_cpu(&rwork, lrwork);
    #endif

    // === Set activemask to one ===
    for(int k =0; k<n; k++)
        iwork[k]=1;
    magma_setmatrix(n, 1, sizeof(magma_int_t), iwork, n ,activeMask, n);

    magma_int_t gramDim, ldgram  = 3*n, ikind = 4;
       
    // === Make the initial vectors orthonormal ===
    magma_dgegqr_gpu(ikind, m, n, blockX, m, dwork, hwork, info );
    //magma_dorthomgs( m, n, blockX );
    
    magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX );

    // === Compute the Gram matrix = (X, AX) & its eigenstates ===
    magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m,
                c_one,  blockX, m, blockAX, m, c_zero, gramM, n);

    magma_dsyevd_gpu( MagmaVec, MagmaUpper,
                      n, gramM, n, evalues, hW, n, hwork, lwork,
                      #if defined(PRECISION_z) || defined(PRECISION_c)
                      rwork, lrwork,
                      #endif
                      iwork, liwork, info );

    // === Update  X =  X * evectors ===
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one,  blockX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockX);

    // === Update AX = AX * evectors ===
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one,  blockAX, m, gramM, n, c_zero, blockW, m);
    magmablas_swap(blockW, blockAX);

    condestGhistory[1] = 7.82;
    magma_int_t iterationNumber, cBlockSize, restart = 1, iter;

    //Chronometry
    real_Double_t tempo1, tempo2;
    magma_device_sync(); tempo1=magma_wtime();
    // === Main LOBPCG loop ============================================================
    for(iterationNumber = 1; iterationNumber < maxIterations; iterationNumber++)
        { 
            // === compute the residuals (R = Ax - x evalues )
            magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m);

/*
            for(int i=0; i<n; i++){
               magma_daxpy(m, MAGMA_D_MAKE(-evalues[i],0), blockX+i*m, 1, blockR+i*m, 1);
            }
  */        
            #if defined(PRECISION_z) || defined(PRECISION_d)
                magma_dsetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
            #else
                magma_ssetmatrix( 3*n, 1, evalues, 3*n, eval_gpu, 3*n );
            #endif

            magma_dlobpcg_res( m, n, eval_gpu, blockX, blockR, eval_gpu);

            magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber));

            // === remove the residuals corresponding to already converged evectors
            magma_dcompact(m, n, blockR, m,
                           residualNorms(0, iterationNumber), residualTolerance, 
                           activeMask, &cBlockSize);
            
            if (cBlockSize == 0)
                break;
        
            // === apply a preconditioner P to the active residuals: R_new = P R_old
            // === for now set P to be identity (no preconditioner => nothing to be done )
            // magmablas_dlacpy( MagmaUpperLower, m, cBlockSize, blockR, m, blockW, m);

            /*
            // === make the preconditioned residuals orthogonal to X
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m,
                        c_one, blockX, m, blockR, m, c_zero, gramB(0,0), ldgram);
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                        c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockR, m);
            */

            // === make the active preconditioned residuals orthonormal
            magma_dgegqr_gpu(ikind, m, cBlockSize, blockR, m, dwork, hwork, info );
            //magma_dorthomgs( m, cBlockSize, blockR );
            
            // === compute AR
            magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockR, c_zero, blockAR );
 
            if (!restart) {
                // === compact P & AP as well
                magma_dcompactActive(m, n, blockP,  m, activeMask);
                magma_dcompactActive(m, n, blockAP, m, activeMask);
          
                /*
                // === make P orthogonal to X ?
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, cBlockSize, m,
                            c_one, blockX, m, blockP, m, c_zero, gramB(0,0), ldgram);
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, n,
                            c_mone, blockX, m, gramB(0,0), ldgram, c_one, blockP, m);

                // === make P orthogonal to R ?
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                            c_one, blockR, m, blockP, m, c_zero, gramB(0,0), ldgram);
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, cBlockSize, cBlockSize,
                            c_mone, blockR, m, gramB(0,0), ldgram, c_one, blockP, m);
                */

                // === Make P orthonormal & properly change AP (without multiplication by A)
                magma_dgegqr_gpu(ikind, m, cBlockSize, blockP, m, dwork, hwork, info );
                //magma_dorthomgs( m, cBlockSize, blockP );

                //magma_d_bspmv_tuned(m, cBlockSize, c_one, A, blockP, c_zero, blockAP );
                magma_dsetmatrix( cBlockSize, cBlockSize, hwork, cBlockSize, dwork, cBlockSize);


//                magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 
  //                           m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);

            // replacement according to Stan
#if defined(PRECISION_s) || defined(PRECISION_d)
            magmablas_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, 
                        m, cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#else
            magma_dtrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaNonUnit, m, 
                            cBlockSize, c_one, dwork, cBlockSize, blockAP, m);
#endif
            }

            iter = max(1,iterationNumber-10- (int)(log(1.*cBlockSize)));
            double condestGmean = 0.;
            for(int i = 0; i<iterationNumber-iter+1; i++)
                condestGmean += condestGhistory[i];
            condestGmean = condestGmean / (iterationNumber-iter+1);

            if (restart)
                gramDim = n+cBlockSize;
            else
                gramDim = n+2*cBlockSize;

            /* --- The Rayleigh-Ritz method for [X R P] -----------------------
               [ X R P ]'  [AX  AR  AP] y = evalues [ X R P ]' [ X R P ], i.e.,
       
                      GramA                                 GramB
                / X'AX  X'AR  X'AP \                 / X'X  X'R  X'P \
               |  R'AX  R'AR  R'AP  | y   = evalues |  R'X  R'R  R'P  |
                \ P'AX  P'AR  P'AP /                 \ P'X  P'R  P'P /       
               -----------------------------------------------------------------   */

            // === assemble GramB; first, set it to I
            magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramB, ldgram);  // identity

            if (!restart) {
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                            c_one, blockP, m, blockX, m, c_zero, gramB(n+cBlockSize,0), ldgram);
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                            c_one, blockP, m, blockR, m, c_zero, gramB(n+cBlockSize,n), ldgram);
            }
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockR, m, blockX, m, c_zero, gramB(n,0), ldgram);

            // === get GramB from the GPU to the CPU and compute its eigenvalues only
            magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram);
            lapackf77_dsyev("N", "L", &gramDim, h_gramB, &ldgram, gevalues, 
                            hwork, &lwork,
                            #if defined(PRECISION_z) || defined(PRECISION_c)
                            rwork, 
                            #endif
                            info);

            // === check stability criteria if we need to restart
            condestG = log10( gevalues[gramDim-1]/gevalues[0] ) + 1.;
            if ((condestG/condestGmean>2 && condestG>2) || condestG>8) {
                // Steepest descent restart for stability
                restart=1;
                printf("restart at step #%d\n", (int) iterationNumber);
            }

            // === assemble GramA; first, set it to I
            magmablas_dlaset(MagmaFull, ldgram, ldgram, c_zero, c_one, gramA, ldgram);  // identity

            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m,
                        c_one, blockR, m, blockAX, m, c_zero, gramA(n,0), ldgram);
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m,
                        c_one, blockR, m, blockAR, m, c_zero, gramA(n,n), ldgram);

            if (!restart) {
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, n, m, 
                            c_one, blockP, m, blockAX, m, c_zero, 
                            gramA(n+cBlockSize,0), ldgram);
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, 
                            c_one, blockP, m, blockAR, m, c_zero, 
                            gramA(n+cBlockSize,n), ldgram);
                magma_dgemm(MagmaConjTrans, MagmaNoTrans, cBlockSize, cBlockSize, m, 
                            c_one, blockP, m, blockAP, m, c_zero, 
                            gramA(n+cBlockSize,n+cBlockSize), ldgram);
            }

            /*
            // === Compute X' AX or just use the eigenvalues below ?
            magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m,
                        c_one, blockX, m, blockAX, m, c_zero,
                        gramA(0,0), ldgram);
            */

            if (restart==0) {
                magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram);
            }
            else {
                gramDim = n+cBlockSize;
                magma_dgetmatrix(gramDim, gramDim, gramA, ldgram, gevectors, ldgram);
            }

            for(int k=0; k<n; k++)
                *gevectors(k,k) = MAGMA_D_MAKE(evalues[k], 0);

            // === the previous eigensolver destroyed what is in h_gramB => must copy it again
            magma_dgetmatrix(gramDim, gramDim, gramB, ldgram, h_gramB, ldgram);

            magma_int_t itype = 1;
            lapackf77_dsygvd(&itype, "V", "L", &gramDim, 
                             gevectors, &ldgram, h_gramB, &ldgram,
                             gevalues, hwork, &lwork, 
                             #if defined(PRECISION_z) || defined(PRECISION_c)
                             rwork, &lrwork,
                             #endif
                             iwork, &liwork, info);
 
            for(int k =0; k<n; k++)
                evalues[k] = gevalues[k];
            
            // === copy back the result to gramA on the GPU and use it for the updates
            magma_dsetmatrix(gramDim, gramDim, gevectors, ldgram, gramA, ldgram);

            if (restart == 0) {
                // === contribution from P to the new X (in new search direction P)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize, 
                            c_one, blockP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m);
                magmablas_swap(dwork, blockP);
 
                // === contribution from R to the new X (in new search direction P)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockR, m, gramA(n,0), ldgram, c_one, blockP, m);

                // === corresponding contribution from AP to the new AX (in AP)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockAP, m, gramA(n+cBlockSize,0), ldgram, c_zero, dwork, m);
                magmablas_swap(dwork, blockAP);

                // === corresponding contribution from AR to the new AX (in AP)
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockAR, m, gramA(n,0), ldgram, c_one, blockAP, m);
            }
            else {
                // === contribution from R (only) to the new X
                magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, cBlockSize,
                            c_one, blockR, m, gramA(n,0), ldgram, c_zero, blockP, m);

                // === corresponding contribution from AR (only) to the new AX
                magma_dgemm(MagmaNoTrans, MagmaNoTrans,m, n, cBlockSize,
                            c_one, blockAR, m, gramA(n,0), ldgram, c_zero, blockAP, m);
            }
            
            // === contribution from old X to the new X + the new search direction P
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                        c_one, blockX, m, gramA, ldgram, c_zero, dwork, m);
            magmablas_swap(dwork, blockX);
            //magma_daxpy(m*n, c_one, blockP, 1, blockX, 1);
            magma_dlobpcg_maxpy( m, n, blockP, blockX );    

            
            // === corresponding contribution from old AX to new AX + AP
            magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                        c_one, blockAX, m, gramA, ldgram, c_zero, dwork, m);
            magmablas_swap(dwork, blockAX);
            //magma_daxpy(m*n, c_one, blockAP, 1, blockAX, 1);
            magma_dlobpcg_maxpy( m, n, blockAP, blockAX );    

            condestGhistory[iterationNumber+1]=condestG;
            if (verbosity==1) {
                // double res;
                // magma_dgetmatrix(1, 1, 
                //                  (double*)residualNorms(0, iterationNumber), 1, 
                //                  (double*)&res, 1);
                // 
                //  printf("Iteration %4d, CBS %4d, Residual: %10.7f\n",
                //         iterationNumber, cBlockSize, res);
                printf("%4d-%2d ", (int) iterationNumber, (int) cBlockSize); 
                magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1);
            }

            restart = 0;
        }   // === end for iterationNumber = 1,maxIterations =======================


    // fill solver info
    magma_device_sync(); tempo2=magma_wtime();
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    solver_par->numiter = iterationNumber;
    if( solver_par->numiter < solver_par->maxiter){
        solver_par->info = 0;
    }else if( solver_par->init_res > solver_par->final_res )
        solver_par->info = -2;
    else
        solver_par->info = -1;
    
    // =============================================================================
    // === postprocessing;
    // =============================================================================

    // === compute the real AX and corresponding eigenvalues
    magma_d_bspmv_tuned(m, n, c_one, A, blockX, c_zero, blockAX );
    magma_dgemm(MagmaConjTrans, MagmaNoTrans, n, n, m,
                c_one,  blockX, m, blockAX, m, c_zero, gramM, n);

    magma_dsyevd_gpu( MagmaVec, MagmaUpper,
                      n, gramM, n, gevalues, dwork, n, hwork, lwork, 
                      #if defined(PRECISION_z) || defined(PRECISION_c)
                      rwork, lrwork,
                      #endif
                      iwork, liwork, info );
   
    for(int k =0; k<n; k++)
        evalues[k] = gevalues[k];

    // === update X = X * evectors
    magmablas_swap(blockX, dwork);
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, dwork, m, gramM, n, c_zero, blockX, m);

    // === update AX = AX * evectors to compute the final residual
    magmablas_swap(blockAX, dwork);
    magma_dgemm(MagmaNoTrans, MagmaNoTrans, m, n, n,
                c_one, dwork, m, gramM, n, c_zero, blockAX, m);

    // === compute R = AX - evalues X
    magmablas_dlacpy( MagmaUpperLower, m, n, blockAX, m, blockR, m);
    for(int i=0; i<n; i++)
        magma_daxpy(m, MAGMA_D_MAKE(-evalues[i], 0), blockX+i*m, 1, blockR+i*m, 1);

    // === residualNorms[iterationNumber] = || R ||    
    magmablas_dnrm2_cols(m, n, blockR, m, residualNorms(0, iterationNumber));

    // === restore blockX if needed
    if (blockX != origX)
        magmablas_dlacpy( MagmaUpperLower, m, n, blockX, m, origX, m);

    printf("Eigenvalues:\n");
    for(int i =0; i<n; i++)
        printf("%e  ", evalues[i]);
    printf("\n\n");

    printf("Final residuals:\n");
    magma_dprint_gpu(1, n, residualNorms(0, iterationNumber), 1);
    printf("\n\n");

    //=== Print residual history in a file for plotting ====
    double *hresidualNorms;
    magma_dmalloc_cpu(&hresidualNorms, (iterationNumber+1) * n);
    magma_dgetmatrix(n, iterationNumber, 
                     (double*)residualNorms, n, 
                     (double*)hresidualNorms, n);

    printf("Residuals are stored in file residualNorms\n");
    printf("Plot the residuals using: myplot \n");
    
    FILE *residuals_file;
    residuals_file = fopen("residualNorms", "w");
    for(int i =1; i<iterationNumber; i++) {
        for(int j = 0; j<n; j++)
            fprintf(residuals_file, "%f ", *hresidualNorms(j,i));
        fprintf(residuals_file, "\n");
    }
    fclose(residuals_file);
    magma_free_cpu(hresidualNorms);

    // === free work space
    magma_free(     residualNorms   );
    magma_free_cpu( condestGhistory );
    magma_free_cpu( gevalues        );
    magma_free_cpu( iwork           );

    magma_free_pinned( hW           );
    magma_free_pinned( gevectors    );
    magma_free_pinned( h_gramB      );

    magma_free(     gramM           );
    magma_free(     gramA           );
    magma_free(     gramB           );
    magma_free(  activeMask         );

    magma_free(     blockAX    );
    magma_free(     blockAR    );
    magma_free(     blockAP    );
    magma_free(     blockR    );
    magma_free(     blockP    );
    magma_free(     blockW    );
    magma_free(     dwork    );   
    magma_free(     eval_gpu    );    

    magma_free_pinned( hwork    );


    #if defined(PRECISION_z) || defined(PRECISION_c)
    magma_free_cpu( rwork           );
    #endif

    return MAGMA_SUCCESS;
}
Example #23
File: dgehrd2.cpp Project: soulsheng/magma
extern "C" magma_int_t
magma_dgehrd2(magma_int_t n, magma_int_t ilo, magma_int_t ihi,
              double *a, magma_int_t lda,
              double *tau, double *work,
              magma_int_t lwork, magma_int_t *info)
{
/*  -- MAGMA (version 1.4.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       August 2013

    Purpose
    =======
    DGEHRD2 reduces a DOUBLE_PRECISION general matrix A to upper Hessenberg form H by
    an orthogonal similarity transformation:  Q' * A * Q = H .

    Arguments
    =========
    N       (input) INTEGER
            The order of the matrix A.  N >= 0.

    ILO     (input) INTEGER
    IHI     (input) INTEGER
            It is assumed that A is already upper triangular in rows
            and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
            set by a previous call to DGEBAL; otherwise they should be
            set to 1 and N respectively. See Further Details.
            1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.

    A       (input/output) DOUBLE_PRECISION array, dimension (LDA,N)
            On entry, the N-by-N general matrix to be reduced.
            On exit, the upper triangle and the first subdiagonal of A
            are overwritten with the upper Hessenberg matrix H, and the
            elements below the first subdiagonal, with the array TAU,
            represent the orthogonal matrix Q as a product of elementary
            reflectors. See Further Details.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    TAU     (output) DOUBLE_PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
            zero.

    WORK    (workspace/output) DOUBLE_PRECISION array, dimension (LWORK)
            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.

    LWORK   (input) INTEGER
            The length of the array WORK.  LWORK >= max(1,N).
            For optimum performance LWORK >= N*NB, where NB is the
            optimal blocksize.

            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ===============
    The matrix Q is represented as a product of (ihi-ilo) elementary
    reflectors

       Q = H(ilo) H(ilo+1) . . . H(ihi-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
    exit in A(i+2:ihi,i), and tau in TAU(i).

    The contents of A are illustrated by the following example, with
    n = 7, ilo = 2 and ihi = 6:

    on entry,                        on exit,

    ( a   a   a   a   a   a   a )    (  a   a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      h   h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  v4  h   h   h )
    (                         a )    (                          a )

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    =====================================================================    */


    double c_one = MAGMA_D_ONE;
    double c_zero = MAGMA_D_ZERO;

    magma_int_t nb = magma_get_dgehrd_nb(n);
    magma_int_t N = n, ldda = n;

    magma_int_t ib;
    magma_int_t nh, iws;
    magma_int_t nbmin, iinfo;
    magma_int_t ldwork;
    magma_int_t lquery;

    --tau;

    *info = 0;
    MAGMA_D_SET2REAL( work[0], (double) n * nb );

    lquery = lwork == -1;
    if (n < 0) {
        *info = -1;
    } else if (ilo < 1 || ilo > max(1,n)) {
        *info = -2;
    } else if (ihi < min(ilo,n) || ihi > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    /* Quick return if possible */
    nh = ihi - ilo + 1;
    if (nh <= 1) {
        work[0] = c_one;
        return *info;
    }

    double *da;
    if (MAGMA_SUCCESS != magma_dmalloc( &da, N*ldda + 2*N*nb + nb*nb )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }
    
    double *d_A    = da;
    double *d_work = da + (N+nb)*ldda;

    magma_int_t i__;

    double *t, *d_t;
    magma_dmalloc_cpu( &t, nb*nb );
    if ( t == NULL ) {
        magma_free( da );
        *info = MAGMA_ERR_HOST_ALLOC;
        return *info;
    }
    d_t = d_work + nb * ldda;

    dzero_nbxnb_block(nb, d_A+N*ldda, ldda);

    /* Set elements 1:ILO-1 and IHI:N-1 of TAU to zero */
    for (i__ = 1; i__ < ilo; ++i__)
        tau[i__] = c_zero;

    for (i__ = max(1,ihi); i__ < n; ++i__)
        tau[i__] = c_zero;

    for(i__=0; i__< nb*nb; i__+=4)
        t[i__] = t[i__+1] = t[i__+2] = t[i__+3] = c_zero;

    nbmin = 2;
    iws = 1;
    if (nb > 1 && nb < nh) {
        /*  Determine when to cross over from blocked to unblocked code
            (last block is always handled by unblocked code)              */
        if (nb < nh) {
            /* Determine if workspace is large enough for blocked code      */
            iws = n * nb;
            if (lwork < iws) {
                /*    Not enough workspace to use optimal NB:  determine the
                      minimum value of NB, and reduce NB or force use of
                      unblocked code                                          */
                nbmin = nb;
                if (lwork >= n * nbmin)
                    nb = lwork / n;
                else
                    nb = 1;
            }
        }
    }
    ldwork = n;

    if (nb < nbmin || nb >= nh) {
        /* Use unblocked code below */
        i__ = ilo;
    }
    else {
        /* Use blocked code */
        /* Copy the matrix to the GPU */
        magma_dsetmatrix( N, N-ilo+1, a+(ilo-1)*(lda), lda, d_A, ldda );
        
        for (i__ = ilo; i__ < ihi - nb; i__ += nb) {
            /* Computing MIN */
            ib = min(nb, ihi - i__);
            
            /*   Reduce columns i:i+ib-1 to Hessenberg form, returning the
                 matrices V and T of the block reflector H = I - V*T*V'
                 which performs the reduction, and also the matrix Y = A*V*T */
            
            /*   Get the current panel (no need for the 1st iteration) */
            magma_dgetmatrix( ihi-i__+1, ib,
                              d_A + (i__ - ilo)*ldda + i__ - 1, ldda,
                              a   + (i__ -  1 )*lda  + i__ - 1, lda );
            
            magma_dlahr2(ihi, i__, ib,
                         d_A + (i__ - ilo)*ldda,
                         d_A + N*ldda + 1,
                         a   + (i__ -   1 )*(lda) , lda,
                         &tau[i__], t, nb, work, ldwork);
            
            /* Copy T from the CPU to D_T on the GPU */
            magma_dsetmatrix( nb, nb, t, nb, d_t, nb );
            
            magma_dlahru(n, ihi, i__ - 1, ib,
                         a   + (i__ -  1 )*(lda), lda,
                         d_A + (i__ - ilo)*ldda,
                         d_A + (i__ - ilo)*ldda + i__ - 1,
                         d_A + N*ldda, d_t, d_work);
        }
    }

    /* Use unblocked code to reduce the rest of the matrix */
    if (!(nb < nbmin || nb >= nh)) {
        magma_dgetmatrix( n, n-i__+1,
                          d_A+ (i__-ilo)*ldda, ldda,
                          a  + (i__-1)*(lda),  lda );
    }
    lapackf77_dgehd2(&n, &i__, &ihi, a, &lda, &tau[1], work, &iinfo);
    MAGMA_D_SET2REAL( work[0], (double) iws );
    
    magma_free( da );
    magma_free_cpu(t);

    return *info;
} /* magma_dgehrd2 */
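The LWORK = -1 workspace query documented above can be exercised as follows. This is a minimal usage sketch, assuming A is an n-by-n column-major matrix already filled on the host (ilo = 1, ihi = n, i.e., no prior balancing):

    magma_int_t n = 1000, info;
    double *A, *tau, *work, query;

    magma_dmalloc_cpu( &A,   n*n );
    magma_dmalloc_cpu( &tau, n-1 );
    // ... fill A ...

    // 1) query: with lwork = -1 the optimal size is returned in work[0]
    magma_dgehrd2( n, 1, n, A, n, tau, &query, -1, &info );
    magma_int_t lwork = (magma_int_t) query;

    // 2) allocate the workspace and run the reduction
    magma_dmalloc_cpu( &work, lwork );
    magma_dgehrd2( n, 1, n, A, n, tau, work, lwork, &info );
    if (info != 0)
        printf( "magma_dgehrd2 returned error %d\n", (int) info );

    magma_free_cpu( work );
    magma_free_cpu( tau );
    magma_free_cpu( A );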
Example #24
extern "C" magma_int_t
magma_dgetrf_mgpu(magma_int_t num_gpus, 
                 magma_int_t m, magma_int_t n, 
                 double **d_lA, magma_int_t ldda,
                 magma_int_t *ipiv, magma_int_t *info)
{
/*  -- MAGMA (version 1.3.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2012

    Purpose
    =======

    DGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Arguments
    =========

    NUM_GPUS 
            (input) INTEGER
            The number of GPUS to be used for the factorization.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       (input/output) DOUBLE_PRECISION array on the GPU, dimension (LDDA,N).
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    LDDA     (input) INTEGER
            The leading dimension of the array A.  LDDA >= max(1,M).

    IPIV    (output) INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    INFO    (output) INTEGER
            = 0:  successful exit
            < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.
    =====================================================================    */

#define inAT(id,i,j) (d_lAT[(id)] + (i)*nb*lddat + (j)*nb)

    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;

    magma_int_t iinfo, nb, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, rows, cols, s, lddat, lddwork;
    magma_int_t id, i_local, i_local2, nb0, nb1;
    double *d_lAT[MagmaMaxGPUs];
    double *d_panel[MagmaMaxGPUs], *work;
    cudaStream_t streaml[4][2];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ldda < max(1,m))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    nb     = magma_get_dgetrf_nb(m);

    if (nb <= 1 || nb >= n) {
          /* Use CPU code. */
          magma_dmalloc_cpu( &work, m * n );
          if ( work == NULL ) {
              *info = MAGMA_ERR_HOST_ALLOC;
              return *info;
          }
          magma_dgetmatrix( m, n, d_lA[0], ldda, work, m );
          lapackf77_dgetrf(&m, &n, work, &m, ipiv, info);
          magma_dsetmatrix( m, n, work, m, d_lA[0], ldda );
          magma_free_cpu(work);
    } else {
          /* Use hybrid blocked code. */
          maxm = ((m + 31)/32)*32;
          if( num_gpus > ceil((double)n/nb) ) {
            printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) num_gpus );
            *info = -1;
            return *info;
          }

          /* allocate workspace for each GPU */
          lddat = (n+nb-1)/nb;                 /* number of block columns         */
          lddat = (lddat+num_gpus-1)/num_gpus; /* number of block columns per GPU */
          lddat = nb*lddat;                    /* number of columns per GPU       */
          lddat = ((lddat+31)/32)*32;          /* make it a multiple of 32        */
          for(i=0; i<num_gpus; i++){
            magma_setdevice(i);

            /* local-n and local-ld */
            n_local[i] = ((n/nb)/num_gpus)*nb;
            if (i < (n/nb)%num_gpus)
               n_local[i] += nb;
            else if (i == (n/nb)%num_gpus)
               n_local[i] += n%nb;

            /* workspaces */
            if (MAGMA_SUCCESS != magma_dmalloc( &d_panel[i], 3*nb*maxm )) {
                /* free the workspaces allocated so far */
                for( j=0; j<i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                    magma_free( d_lAT[j]   );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            /* local-matrix storage */
            if (MAGMA_SUCCESS != magma_dmalloc( &d_lAT[i], lddat*maxm )) {
                for( j=0; j<=i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_panel[j] );
                }
                for( j=0; j<i; j++ ) {
                    magma_setdevice(j);
                    magma_free( d_lAT[j] );
                }
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            }

            /* create the streams */
            magma_queue_create( &streaml[i][0] );
            magma_queue_create( &streaml[i][1] );

            magmablasSetKernelStream(streaml[i][1]);
            magmablas_dtranspose2( d_lAT[i], lddat, d_lA[i], ldda, m, n_local[i] );
          }
          for(i=0; i<num_gpus; i++){
            magma_setdevice(i);
            cudaStreamSynchronize(streaml[i][1]);  /* wait for the transpose issued on stream 1 */
            magmablasSetKernelStream(NULL);
          }
          magma_setdevice(0);

          /* cpu workspace */
          lddwork = maxm;
          if (MAGMA_SUCCESS != magma_dmalloc_pinned( &work, lddwork*nb*num_gpus )) {
              for(i=0; i<num_gpus; i++ ) {
                  magma_setdevice(i);
                  magma_free( d_panel[i] );
                  magma_free( d_lAT[i]   );
              }
              *info = MAGMA_ERR_HOST_ALLOC;
              return *info;
          }

          /* calling multi-gpu interface with allocated workspaces and streams */
          //magma_dgetrf1_mgpu( num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
          //                   (cudaStream_t **)streaml, info );
          magma_dgetrf2_mgpu(num_gpus, m, n, nb, 0, d_lAT, lddat, ipiv, d_panel, work, maxm,
                             streaml, info);

          /* clean up */
          for( d=0; d<num_gpus; d++ ) {
              magma_setdevice(d);
              
              /* save on output */
              magmablas_dtranspose2( d_lA[d], ldda, d_lAT[d], lddat, n_local[d], m );
              magma_device_sync();
              magma_free( d_lAT[d]   );
              magma_free( d_panel[d] );
              magma_queue_destroy( streaml[d][0] );
              magma_queue_destroy( streaml[d][1] );
              magmablasSetKernelStream(NULL);
          } /* end of for d=1,..,num_gpus */
          magma_setdevice(0);
          magma_free_pinned( work );
    }

    return *info;
    /* End of MAGMA_DGETRF_MGPU */
}
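The IPIV convention documented above is the standard LAPACK one: row i of the matrix was interchanged with row IPIV(i). A minimal host-side sketch applying these interchanges to a right-hand-side vector, i.e. the vector analogue of dlaswp (apply_pivots is a hypothetical helper, not part of MAGMA):

    // Hedged sketch: apply the row swaps recorded in ipiv (1-based LAPACK
    // convention) to a vector b of length >= k, in forward order.
    static void apply_pivots( magma_int_t k, const magma_int_t *ipiv, double *b )
    {
        for( magma_int_t i = 0; i < k; i++ ) {
            magma_int_t p = ipiv[i] - 1;   // convert to 0-based
            if (p != i) {
                double tmp = b[i];
                b[i] = b[p];
                b[p] = tmp;
            }
        }
    }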
Example #25
/**
    Purpose
    -------
    DGEHRD reduces a DOUBLE PRECISION general matrix A to upper Hessenberg form H by
    an orthogonal similarity transformation:  Q' * A * Q = H . This version
    stores the triangular matrices used in the factorization so that they can
    be applied directly (i.e., without being recomputed) later. As a result,
    the application of Q is much faster.

    Arguments
    ---------
    @param[in]
    n       INTEGER
            The order of the matrix A.  N >= 0.

    @param[in]
    ilo     INTEGER
    @param[in]
    ihi     INTEGER
            It is assumed that A is already upper triangular in rows
            and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally
            set by a previous call to DGEBAL; otherwise they should be
            set to 1 and N respectively. See Further Details.
            1 <= ILO <= IHI <= N, if N > 0; ILO=1 and IHI=0, if N=0.

    @param[in,out]
    A       DOUBLE PRECISION array, dimension (LDA,N)
            On entry, the N-by-N general matrix to be reduced.
            On exit, the upper triangle and the first subdiagonal of A
            are overwritten with the upper Hessenberg matrix H, and the
            elements below the first subdiagonal, with the array TAU,
            represent the orthogonal matrix Q as a product of elementary
            reflectors. See Further Details.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    @param[out]
    tau     DOUBLE PRECISION array, dimension (N-1)
            The scalar factors of the elementary reflectors (see Further
            Details). Elements 1:ILO-1 and IHI:N-1 of TAU are set to
            zero.

    @param[out]
    work    (workspace) DOUBLE PRECISION array, dimension (LWORK)
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    @param[in]
    lwork   INTEGER
            The length of the array WORK.  LWORK >= N*NB,
            where NB is the optimal blocksize.
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued by XERBLA.

    @param[out]
    dT      DOUBLE PRECISION array on the GPU, dimension NB*N,
            where NB is the optimal blocksize. It stores the NB*NB blocks
            of the triangular T matrices used in the reduction.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.

    Further Details
    ---------------
    The matrix Q is represented as a product of (ihi-ilo) elementary
    reflectors

       Q = H(ilo) H(ilo+1) . . . H(ihi-1).

    Each H(i) has the form

       H(i) = I - tau * v * v'

    where tau is a real scalar, and v is a real vector with
    v(1:i) = 0, v(i+1) = 1 and v(ihi+1:n) = 0; v(i+2:ihi) is stored on
    exit in A(i+2:ihi,i), and tau in TAU(i).

    The contents of A are illustrated by the following example, with
    n = 7, ilo = 2 and ihi = 6:

    @verbatim
    on entry,                        on exit,

    ( a   a   a   a   a   a   a )    (  a   a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      a   h   h   h   h   a )
    (     a   a   a   a   a   a )    (      h   h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  h   h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  h   h   h   h )
    (     a   a   a   a   a   a )    (      v2  v3  v4  h   h   h )
    (                         a )    (                          a )
    @endverbatim

    where a denotes an element of the original matrix A, h denotes a
    modified element of the upper Hessenberg matrix H, and vi denotes an
    element of the vector defining H(i).

    This implementation follows the hybrid algorithm and notations described in

    S. Tomov and J. Dongarra, "Accelerating the reduction to upper Hessenberg
    form through hybrid GPU-based computing," University of Tennessee Computer
    Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219),
    May 24, 2009.
    
    This version stores the T matrices in dT, for later use in magma_dorghr.

    @ingroup magma_dgeev_comp
    ********************************************************************/
extern "C" magma_int_t
magma_dgehrd(
    magma_int_t n, magma_int_t ilo, magma_int_t ihi,
    double *A, magma_int_t lda,
    double *tau,
    double *work, magma_int_t lwork,
    magmaDouble_ptr dT,
    magma_int_t *info)
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda)

    #ifdef HAVE_clBLAS
    #define dA(i_,j_)  dwork, ((i_) + (j_)*ldda + nb*ldda*2)
    #define dT(i_,j_)  dT,    ((i_) + (j_)*nb   + dT_offset)
    #define dV(i_,j_)  dwork, ((i_) + (j_)*ldda + nb*ldda)
    #define dwork(i_)  dwork, ((i_))
    #else
    #define dA(i_,j_) (dA    + (i_) + (j_)*ldda)
    #define dT(i_,j_) (dT    + (i_) + (j_)*nb)
    #define dV(i_,j_) (dV    + (i_) + (j_)*ldda)
    #define dwork(i_) (dwork + (i_))
    #endif

    // Constants
    const double c_one  = MAGMA_D_ONE;
    const double c_zero = MAGMA_D_ZERO;

    // Local variables
    magma_int_t nb = magma_get_dgehrd_nb( n );
    magma_int_t ldda = magma_roundup( n, 32 );

    magma_int_t i, nh, iws;
    magma_int_t iinfo;
    magma_int_t lquery;

    *info = 0;
    iws = n*nb;
    work[0] = magma_dmake_lwork( iws );

    lquery = (lwork == -1);
    if (n < 0) {
        *info = -1;
    } else if (ilo < 1 || ilo > max(1,n)) {
        *info = -2;
    } else if (ihi < min(ilo,n) || ihi > n) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;
    } else if (lwork < iws && ! lquery) {
        *info = -8;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery)
        return *info;

    // Adjust from 1-based indexing
    ilo -= 1;
    
    // Quick return if possible
    nh = ihi - ilo;
    if (nh <= 1) {
        work[0] = c_one;
        return *info;
    }

    // Now requires lwork >= iws; else dT won't be computed in unblocked code.
    // If not enough workspace, use unblocked code
    //if ( lwork < iws ) {
    //    nb = 1;
    //}

    if (nb == 1 || nb > nh) {
        // Use unblocked code below
        i = ilo;
    }
    else {
        // Use blocked code
        magma_queue_t queue;
        magma_device_t cdev;
        magma_getdevice( &cdev );
        magma_queue_create( cdev, &queue );
        
        // GPU workspace is:
        //   nb*ldda for dwork for dlahru
        //   nb*ldda for dV
        //   n*ldda  for dA
        magmaDouble_ptr dwork;
        if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 2*nb*ldda + n*ldda )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;
        }
        double *dV = dwork + nb*ldda;
        double *dA = dwork + nb*ldda*2;
        
        double *T;
        magma_dmalloc_cpu( &T, nb*nb );
        if ( T == NULL ) {
            magma_free( dwork );
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }
        
        // zero first block of V, which is lower triangular
        magmablas_dlaset( MagmaFull, nb, nb, c_zero, c_zero, dV(0,0), ldda, queue );
        
        // Set elements 0:ILO-1 and IHI-1:N-2 of TAU to zero
        for (i = 0; i < ilo; ++i)
            tau[i] = c_zero;
        
        for (i = max(0,ihi-1); i < n-1; ++i)
            tau[i] = c_zero;
        
        assert( nb % 4 == 0 );
        for (i=0; i < nb*nb; i += 4)
            T[i] = T[i+1] = T[i+2] = T[i+3] = c_zero;
        
        magmablas_dlaset( MagmaFull, nb, n, c_zero, c_zero, dT(0,0), nb, queue );
        
        // Copy the matrix to the GPU
        magma_dsetmatrix( n, n-ilo, A(0,ilo), lda, dA(0,0), ldda, queue );
        
        for (i = ilo; i < ihi-1 - nb; i += nb) {
            // Reduce columns i:i+nb-1 to Hessenberg form, returning the
            // matrices V and T of the block reflector H = I - V*T*V'
            // which performs the reduction, and also the matrix Y = A*V*T
            
            // Get the current panel (no need for the 1st iteration)
            magma_dgetmatrix( ihi-i, nb,
                              dA(i,i-ilo), ldda,
                              A(i,i), lda, queue );
            
            // add 1 to i for 1-based index
            magma_dlahr2( ihi, i+1, nb,
                          dA(0,i-ilo), ldda,
                          dV(0,0),     ldda,
                          A(0,i),      lda,
                          &tau[i], T, nb, work, n, queue );
            
            // Copy T from the CPU to dT on the GPU
            magma_dsetmatrix( nb, nb, T, nb, dT(0,i-ilo), nb, queue );
            
            magma_dlahru( n, ihi, i, nb,
                          A(0,i),      lda,
                          dA(0,i-ilo), ldda, // dA
                          dA(i,i-ilo), ldda, // dY, stored over current panel
                          dV(0,0),     ldda,
                          dT(0,i-ilo), dwork(0), queue );
        }
        
        // Copy remainder to host
        magma_dgetmatrix( n, n-i,
                          dA(0,i-ilo), ldda,
                          A(0,i), lda, queue );
        
        magma_free( dwork );
        magma_free_cpu( T );
        
        magma_queue_destroy( queue );
    }

    // Use unblocked code to reduce the rest of the matrix
    // add 1 to i for 1-based index
    i += 1;
    lapackf77_dgehd2(&n, &i, &ihi, A, &lda, tau, work, &iinfo);
    work[0] = magma_dmake_lwork( iws );

    return *info;
} /* magma_dgehrd */
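As noted above, this version stores the T matrices in dT for later use in magma_dorghr. A minimal sketch of that pairing, assuming magma_dorghr takes (n, ilo, ihi, A, lda, tau, dT, nb, info) as in recent MAGMA releases; check the installed headers before relying on this signature:

    magma_int_t n = 1000, ilo = 1, ihi = n, info;
    magma_int_t nb    = magma_get_dgehrd_nb( n );
    magma_int_t lwork = n*nb;
    double *A, *tau, *work;
    magmaDouble_ptr dT;

    magma_dmalloc_cpu( &A,    n*n   );
    magma_dmalloc_cpu( &tau,  n-1   );
    magma_dmalloc_cpu( &work, lwork );
    magma_dmalloc(     &dT,   nb*n  );
    // ... fill A ...

    magma_dgehrd( n, ilo, ihi, A, n, tau, work, lwork, dT, &info );
    // A now holds H and the reflectors; overwrite them with the orthogonal Q
    magma_dorghr( n, ilo, ihi, A, n, tau, dT, nb, &info );

    magma_free( dT );
    magma_free_cpu( work );
    magma_free_cpu( tau );
    magma_free_cpu( A );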
Example #26
extern "C" magma_int_t
magma_dmlumerge(    
    magma_d_sparse_matrix L, 
    magma_d_sparse_matrix U,
    magma_d_sparse_matrix *A,
    magma_queue_t queue ){
    if( L.storage_type == Magma_CSR && U.storage_type == Magma_CSR ){
        if( L.memory_location == Magma_CPU && U.memory_location == Magma_CPU ){
            
            magma_d_mtransfer( L, A, Magma_CPU, Magma_CPU, queue );
            magma_free_cpu( A->col );
            magma_free_cpu( A->val );
            // count the nonzeros: the strictly lower triangular part of L plus all of U
            magma_int_t z = 0;
            for(magma_int_t i=0; i<A->num_rows; i++){
                for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){
                    if( L.col[j] < i ){// make sure it is strictly lower triangular
                        z++;
                    }
                }
                for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){
                    z++;
                }
            }
            A->nnz = z;
            // fill A with the new structure;
            magma_int_t stat_cpu = 0;
            stat_cpu += magma_index_malloc_cpu( &A->col, A->nnz );
            stat_cpu += magma_dmalloc_cpu( &A->val, A->nnz );
            if( stat_cpu != 0 ){
                magma_d_mfree( A, queue );
                printf("error: memory allocation.\n");
                return MAGMA_ERR_HOST_ALLOC;
            }
            z = 0;
            for(magma_int_t i=0; i<A->num_rows; i++){
                A->row[i] = z;
                for(magma_int_t j=L.row[i]; j<L.row[i+1]; j++){
                    if( L.col[j] < i ){// make sure it is strictly lower triangular
                        A->col[z] = L.col[j];
                        A->val[z] = L.val[j];
                        z++;
                    }
                }
                for(magma_int_t j=U.row[i]; j<U.row[i+1]; j++){
                    A->col[z] = U.col[j];
                    A->val[z] = U.val[j];
                    z++;
                }
            }
            A->row[A->num_rows] = z;
            A->nnz = z;
            return MAGMA_SUCCESS; 
        }
        else{
    
            printf("error: matrix not on CPU.\n"); 
    
            return MAGMA_SUCCESS; 
        }
    }
    else{
        
        printf("error: matrix not on CPU.\n"); 
    
        return MAGMA_SUCCESS; 
    }
}
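The merge above walks two CSR matrices row by row. For reference, a minimal illustration of the CSR layout it assumes: row[i]..row[i+1]-1 delimits the nonzeros of row i in col[]/val[], which is exactly the range the loops above iterate over. A hand-built 3-by-3 example:

    // Hedged illustration (toy data, not MAGMA code): CSR storage of
    //     [ 4 0 1 ]
    //     [ 2 5 0 ]
    //     [ 0 3 6 ]
    magma_index_t row[] = { 0, 2, 4, 6 };
    magma_index_t col[] = { 0, 2,   0, 1,   1, 2 };
    double        val[] = { 4, 1,   2, 5,   3, 6 };

    for( magma_int_t i = 0; i < 3; i++ )
        for( magma_index_t j = row[i]; j < row[i+1]; j++ )
            printf( "A(%d,%d) = %4.1f\n", (int) i, (int) col[j], val[j] );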
Example #27
File: dorgqr_m.cpp  Project: maxhutch/magma
/***************************************************************************//**
    Purpose
    -------
    DORGQR generates an M-by-N DOUBLE PRECISION matrix Q with orthonormal columns,
    which is defined as the first N columns of a product of K elementary
    reflectors of order M

        Q  =  H(1) H(2) . . . H(k)

    as returned by DGEQRF.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix Q. M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix Q. M >= N >= 0.

    @param[in]
    k       INTEGER
            The number of elementary reflectors whose product defines the
            matrix Q. N >= K >= 0.

    @param[in,out]
    A       DOUBLE PRECISION array A, dimension (LDDA,N).
            On entry, the i-th column must contain the vector
            which defines the elementary reflector H(i), for
            i = 1,2,...,k, as returned by DGEQRF_GPU in the
            first k columns of its array argument A.
            On exit, the M-by-N matrix Q.

    @param[in]
    lda     INTEGER
            The first dimension of the array A. LDA >= max(1,M).

    @param[in]
    tau     DOUBLE PRECISION array, dimension (K)
            TAU(i) must contain the scalar factor of the elementary
            reflector H(i), as returned by DGEQRF_GPU.

    @param[in]
    T       DOUBLE PRECISION array, dimension (NB, min(M,N)).
            T contains the T matrices used in blocking the elementary
            reflectors H(i), e.g., this can be the 6th argument of
            magma_dgeqrf_gpu (except stored on the CPU, not the GPU).

    @param[in]
    nb      INTEGER
            This is the block size used in DGEQRF_GPU, and correspondingly
            the size of the T matrices, used in the factorization, and
            stored in T.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value

    @ingroup magma_ungqr
*******************************************************************************/
extern "C" magma_int_t
magma_dorgqr_m(
    magma_int_t m, magma_int_t n, magma_int_t k,
    double *A, magma_int_t lda,
    double *tau,
    double *T, magma_int_t nb,
    magma_int_t *info)
{
#define  A(i,j)   ( A    + (i) + (j)*lda )
#define dA(d,i,j) (dA[d] + (i) + (j)*ldda)
#define dT(d,i,j) (dT[d] + (i) + (j)*nb)

    double c_zero = MAGMA_D_ZERO;
    double c_one  = MAGMA_D_ONE;

    magma_int_t m_kk, n_kk, k_kk, mi;
    magma_int_t lwork, ldwork;
    magma_int_t d, i, ib, j, jb, ki, kk;
    double *work=NULL;

    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if ((n < 0) || (n > m)) {
        *info = -2;
    } else if ((k < 0) || (k > n)) {
        *info = -3;
    } else if (lda < max(1,m)) {
        *info = -5;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }

    if (n <= 0) {
        return *info;
    }
    
    magma_int_t di, dn;
    magma_int_t dpanel;

    magma_int_t ngpu = magma_num_gpus();
    
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    
    // Allocate memory on GPUs for A and workspaces
    magma_int_t ldda    = magma_roundup( m, 32 );
    magma_int_t lddwork = magma_roundup( n, 32 );
    magma_int_t min_lblocks = (n / nb) / ngpu;  // min. blocks per gpu
    magma_int_t last_dev    = (n / nb) % ngpu;  // device with last block
    
    magma_int_t  nlocal[ MagmaMaxGPUs ] = { 0 };
    double *dA[ MagmaMaxGPUs ] = { NULL };
    double *dT[ MagmaMaxGPUs ] = { NULL };
    double *dV[ MagmaMaxGPUs ] = { NULL };
    double *dW[ MagmaMaxGPUs ] = { NULL };
    magma_queue_t queues[ MagmaMaxGPUs ] = { NULL };
    
    for( d = 0; d < ngpu; ++d ) {
        // example with n = 75, nb = 10, ngpu = 3
        // min_lblocks = 2
        // last_dev    = 1
        // gpu 0: 3  blocks, cols:  0- 9, 30-39, 60-69
        // gpu 1: 2+ blocks, cols: 10-19, 40-49, 70-74 (partial)
        // gpu 2: 2  blocks, cols: 20-29, 50-59
        magma_setdevice( d );
        nlocal[d] = min_lblocks*nb;
        if ( d < last_dev ) {
            nlocal[d] += nb;
        }
        else if ( d == last_dev ) {
            nlocal[d] += (n % nb);
        }
        
        ldwork = nlocal[d]*ldda  // dA
               + nb*m            // dT
               + nb*ldda         // dV
               + nb*lddwork;     // dW
        if ( MAGMA_SUCCESS != magma_dmalloc( &dA[d], ldwork )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            goto cleanup;
        }
        dT[d] = dA[d] + nlocal[d]*ldda;
        dV[d] = dT[d] + nb*m;
        dW[d] = dV[d] + nb*ldda;
        
        magma_queue_create( d, &queues[d] );
    }
    
    trace_init( 1, ngpu, 1, queues );
    
    // first kk columns are handled by blocked method.
    // ki is start of 2nd-to-last block
    if ((nb > 1) && (nb < k)) {
        ki = (k - nb - 1) / nb * nb;
        kk = min(k, ki + nb);
    } else {
        ki = 0;
        kk = 0;
    }

    // Allocate CPU work space
    // n*nb  for larfb work
    // m*nb  for V
    // nb*nb for T
    lwork = (n + m + nb) * nb;
    magma_dmalloc_cpu( &work, lwork );
    if (work == NULL) {
        *info = MAGMA_ERR_HOST_ALLOC;
        goto cleanup;
    }
    double *work_T, *work_V;
    work_T = work + n*nb;
    work_V = work + n*nb + nb*nb;

    // Use unblocked code for the last or only block.
    if (kk < n) {
        trace_cpu_start( 0, "ungqr", "ungqr last block" );
        m_kk = m - kk;
        n_kk = n - kk;
        k_kk = k - kk;
        
        // dorgqr requires less workspace (n*nb), but is slow if k < dorgqr's block size.
        // replacing it with the 4 routines below is much faster (e.g., 60x).
        //magma_int_t iinfo;
        //lapackf77_dorgqr( &m_kk, &n_kk, &k_kk,
        //                  A(kk, kk), &lda,
        //                  &tau[kk], work, &lwork, &iinfo );
        
        lapackf77_dlacpy( MagmaFullStr, &m_kk, &k_kk, A(kk,kk), &lda, work_V, &m_kk);
        lapackf77_dlaset( MagmaFullStr, &m_kk, &n_kk, &c_zero, &c_one, A(kk, kk), &lda );
        
        lapackf77_dlarft( MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &k_kk,
                          work_V, &m_kk, &tau[kk], work_T, &k_kk);
        lapackf77_dlarfb( MagmaLeftStr, MagmaNoTransStr, MagmaForwardStr, MagmaColumnwiseStr,
                          &m_kk, &n_kk, &k_kk,
                          work_V, &m_kk, work_T, &k_kk, A(kk, kk), &lda, work, &n_kk );
        
        if (kk > 0) {
            for( j=kk; j < n; j += nb ) {
                jb = min( n-j, nb );
                d  =  (j / nb) % ngpu;
                di = ((j / nb) / ngpu) * nb;
                magma_setdevice( d );
                magma_dsetmatrix( m_kk, jb,
                                  A(kk, j),  lda,
                                  dA(d, kk, di), ldda, queues[d] );
                
                // Set A(1:kk,kk+1:n) to zero.
                magmablas_dlaset( MagmaFull, kk, jb, c_zero, c_zero, dA(d, 0, di), ldda, queues[d] );
            }
        }
        trace_cpu_end( 0 );
    }

    if (kk > 0) {
        // Use blocked code
        // send T to all GPUs
        for( d = 0; d < ngpu; ++d ) {
            magma_setdevice( d );
            trace_gpu_start( d, 0, "set", "set T" );
            magma_dsetmatrix_async( nb, min(m,n), T, nb, dT[d], nb, queues[d] );
            trace_gpu_end( d, 0 );
        }
        
        // queue: set Aii (V) --> laset --> laset --> larfb --> [next]
        // CPU has no computation
        for( i = ki; i >= 0; i -= nb ) {
            ib = min(nb, k - i);
            mi = m - i;
            dpanel =  (i / nb) % ngpu;
            di     = ((i / nb) / ngpu) * nb;

            // Send current panel to dV on the GPUs
            lapackf77_dlaset( "Upper", &ib, &ib, &c_zero, &c_one, A(i, i), &lda );
            for( d = 0; d < ngpu; ++d ) {
                magma_setdevice( d );
                trace_gpu_start( d, 0, "set", "set V" );
                magma_dsetmatrix_async( mi, ib,
                                        A(i, i), lda,
                                        dV[d],   ldda, queues[d] );
                trace_gpu_end( d, 0 );
            }
            
            // set panel to identity
            magma_setdevice( dpanel );
            trace_gpu_start( dpanel, 0, "laset", "laset" );
            magmablas_dlaset( MagmaFull, i,  ib, c_zero, c_zero, dA(dpanel, 0, di), ldda, queues[dpanel] );
            magmablas_dlaset( MagmaFull, mi, ib, c_zero, c_one,  dA(dpanel, i, di), ldda, queues[dpanel] );
            trace_gpu_end( dpanel, 0 );
            
            if (i < n) {
                // Apply H to A(i:m,i:n) from the left
                for( d = 0; d < ngpu; ++d ) {
                    magma_setdevice( d );
                    magma_indices_1D_bcyclic( nb, ngpu, d, i, n, &di, &dn );
                    trace_gpu_start( d, 0, "larfb", "larfb" );
                    magma_dlarfb_gpu( MagmaLeft, MagmaNoTrans, MagmaForward, MagmaColumnwise,
                                      mi, dn-di, ib,
                                      dV[d],        ldda, dT(d,0,i), nb,
                                      dA(d, i, di), ldda, dW[d], lddwork, queues[d] );
                    trace_gpu_end( d, 0 );
                }
            }
        }
        
        // copy result back to CPU
        trace_cpu_start( 0, "get", "get A" );
        magma_dgetmatrix_1D_col_bcyclic( ngpu, m, n, nb, dA, ldda, A, lda, queues );
        trace_cpu_end( 0 );
    }
    
    #ifdef TRACING
    char name[80];
    snprintf( name, sizeof(name), "dorgqr-n%lld-ngpu%lld.svg", (long long) m, (long long) ngpu );
    trace_finalize( name, "trace.css" );
    #endif
    
cleanup:
    for( d = 0; d < ngpu; ++d ) {
        magma_setdevice( d );
        magma_free( dA[d] );
        magma_queue_destroy( queues[d] );
    }
    magma_free_cpu( work );
    magma_setdevice( orig_dev );
    
    return *info;
} /* magma_dorgqr */
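The (d, di) arithmetic above implements a 1-D block-cyclic column distribution. A standalone sketch that reproduces the n = 75, nb = 10, ngpu = 3 layout from the comment in the code:

    // Hedged sketch: map global column blocks to GPUs the same way as above,
    // via  d = (j/nb) % ngpu  and  di = ((j/nb)/ngpu) * nb.
    #include <stdio.h>

    int main( void )
    {
        int n = 75, nb = 10, ngpu = 3;
        for (int j = 0; j < n; j += nb) {
            int blk = j / nb;
            int d   = blk % ngpu;          // owning device
            int di  = (blk / ngpu) * nb;   // local column offset on that device
            int jb  = (n - j < nb ? n - j : nb);
            printf( "global cols %2d-%2d -> gpu %d, local cols %2d-%2d\n",
                    j, j + jb - 1, d, di, di + jb - 1 );
        }
        return 0;
    }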
Example #28
int main( int argc, char** argv )
{
    magma_init();
    cublasHandle_t handle;
    cudaSetDevice( 0 );
    cublasCreate( &handle );
    
    double *A, *B, *C;
    double *dA, *dB, *dC;
    double error, work[1];
    double c_one     = MAGMA_D_ONE;
    double c_neg_one = MAGMA_D_NEG_ONE;
    magma_int_t ione = 1;
    magma_int_t ISEED[4] = { 1, 2, 3, 4 };
    magma_int_t n = 10;
    magma_int_t lda = n;
    magma_int_t ldda = ((n+31)/32)*32;
    magma_int_t size = lda*n;
    magma_int_t info;
    
    magma_dmalloc_cpu( &A, lda*n );
    magma_dmalloc_cpu( &B, lda*n );
    magma_dmalloc_cpu( &C, lda*n );
    magma_dmalloc( &dA, ldda*n );
    magma_dmalloc( &dB, ldda*n );
    magma_dmalloc( &dC, ldda*n );
    
    // initialize matrices
    lapackf77_dlarnv( &ione, ISEED, &size, A );
    lapackf77_dlarnv( &ione, ISEED, &size, B );
    lapackf77_dlarnv( &ione, ISEED, &size, C );
    // increase diagonal to be SPD
    for( int i=0; i < n; ++i ) {
        C[i+i*lda] = MAGMA_D_ADD( C[i+i*lda], MAGMA_D_MAKE( n*n, 0 ));
    }
    
    magma_dsetmatrix( n, n, A, lda, dA, ldda );
    magma_dsetmatrix( n, n, B, lda, dB, ldda );
    magma_dsetmatrix( n, n, C, lda, dC, ldda );
    
    // compute with cublas
    cublasDgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                 &c_neg_one, dA, ldda, dB, ldda, &c_one, dC, ldda );
    
    magma_dpotrf_gpu( MagmaLower, n, dC, ldda, &info );
    if (info != 0)
        printf("magma_dpotrf returned error %d: %s.\n",
               (int) info, magma_strerror( info ));
    
    // compute with LAPACK
    blasf77_dgemm( MagmaNoTransStr, MagmaNoTransStr, &n, &n, &n,
                   &c_neg_one, A, &lda, B, &lda, &c_one, C, &lda );
    
    lapackf77_dpotrf( MagmaLowerStr, &n, C, &lda, &info );
    if (info != 0)
        printf("lapackf77_dpotrf returned error %d: %s.\n",
               (int) info, magma_strerror( info ));
    
    // compute difference
    magma_dgetmatrix( n, n, dC, ldda, A, lda );
    blasf77_daxpy( &size, &c_neg_one, C, &ione, A, &ione );
    error = lapackf77_dlange( "F", &n, &n, A, &lda, work );
    printf( "n %d, error %8.2e\n", n, error );
    
    magma_free( dA );
    magma_free( dB );
    magma_free( dC );
    magma_free_cpu( A );
    magma_free_cpu( B );
    magma_free_cpu( C );
    
    cublasDestroy( handle );
    magma_finalize();
    return 0;
}
Example #29
static void magma_dtile_bulge_applyQ(
    magma_int_t core_id, magma_side_t side, magma_int_t n_loc, magma_int_t n, magma_int_t nb, magma_int_t Vblksiz,
    double *E, magma_int_t lde,
    double *V, magma_int_t ldv,
    double *TAU, double *T, magma_int_t ldt)
    //, magma_int_t* info)
{
    //%===========================
    //%   local variables
    //%===========================
    magma_int_t firstcolj;
    magma_int_t bg, rownbm;
    magma_int_t st,ed,fst,vlen,vnb,colj;
    magma_int_t vpos,tpos;
    magma_int_t cur_blksiz,avai_blksiz, ncolinvolvd;
    magma_int_t nbgr, colst, coled;

    if (n <= 0)
        return;
    if (n_loc <= 0)
        return;

    //info = 0;
    magma_int_t INFO=0;

    magma_int_t nbGblk  = magma_ceildiv(n-1, Vblksiz);

    /*
     * version v1: for each chunk, apply all the V's, then move to the
     *             next chunk. The locality inside each chunk comes from
     *             thread t applying V_k and then moving to V_k+1, which
     *             overlaps with V_k, so E_k+1 overlaps with E_k. There is
     *             locality, but thread t still has to read V_k+1 and
     *             T_k+1 at each apply. Note that if all threads run at
     *             the same speed, they may be reading the same V_k and
     *             T_k at the same time.
     */

    magma_int_t nb_loc = 128; // chunk width over the local dimension of E

    magma_int_t     lwork = 2*nb_loc*max(Vblksiz,64);
    double *work, *work2;

    magma_dmalloc_cpu(&work, lwork);
    magma_dmalloc_cpu(&work2, lwork);

    magma_int_t nbchunk =  magma_ceildiv(n_loc, nb_loc);

    /* SIDE LEFT  means apply E = Q*E = (q_1*q_2*.....*q_n) * E ==> traverse the Vs in reverse order, from q_n to q_1.
     *            Each q_i applies V to a block of rows E(row_i,:); the applications overlap, so
     *            q_i+1 touches a portion of E(row_i,:).
     *            In parallel, E is split into vertical blocks over the threads.  */
    /* SIDE RIGHT means apply E = E*Q = E * (q_1*q_2*.....*q_n) ==> traverse the Vs in normal order, from q_1 to q_n.
     *            Each q_i applies V to a block of columns E(:,col_i); the applications overlap, so
     *            q_i+1 touches a portion of E(:,col_i).
     *            In parallel, E is split into horizontal blocks over the threads.  */
    #ifdef ENABLE_DEBUG
    if ((core_id == 0) || (core_id == 1))
        printf("  APPLY Q2_cpu dbulge_back   N %d  N_loc %d  nbchunk %d  NB %d  Vblksiz %d  SIDE %c \n", n, n_loc, nbchunk, nb, Vblksiz, side);
    #endif
    for (magma_int_t i = 0; i < nbchunk; i++) {
        magma_int_t ib_loc = min(nb_loc, (n_loc - i*nb_loc));

        if (side == MagmaLeft) {
            for (bg = nbGblk; bg > 0; bg--) {
                firstcolj = (bg-1)*Vblksiz + 1;
                rownbm    = magma_ceildiv((n-(firstcolj+1)),nb);
                if (bg == nbGblk) rownbm = magma_ceildiv((n-(firstcolj)),nb);  // last blk has size=1 used for real to handle A(N,N-1)
                for (magma_int_t j = rownbm; j > 0; j--) {
                    vlen = 0;
                    vnb  = 0;
                    colj = (bg-1)*Vblksiz; // for k=0; I compute the fst and then can remove it from the loop
                    fst  = (rownbm -j)*nb+colj +1;
                    for (magma_int_t k=0; k < Vblksiz; k++) {
                        colj = (bg-1)*Vblksiz + k;
                        st   = (rownbm -j)*nb+colj +1;
                        ed   = min(st+nb-1,n-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != n-2))
                            break;
                        vlen = ed-fst+1;
                        vnb = k+1;
                    }
                    colst     = (bg-1)*Vblksiz;
                    magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);

                    if ((vlen > 0) && (vnb > 0)) {
                        lapackf77_dlarfb( "L", "N", "F", "C", &vlen, &ib_loc, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(fst,i*nb_loc), &lde, work, &ib_loc);
                    }
                    if (INFO != 0)
                        printf("ERROR DORMQR INFO %d \n", (int) INFO);
                }
            }
        } else if (side == MagmaRight) {
            rownbm    = magma_ceildiv((n-1),nb);
            for (magma_int_t k = 1; k <= rownbm; k++) {
                ncolinvolvd = min(n-1, k*nb);
                avai_blksiz = min(Vblksiz,ncolinvolvd);
                nbgr = magma_ceildiv(ncolinvolvd,avai_blksiz);
                for (magma_int_t j = 1; j <= nbgr; j++) {
                    vlen = 0;
                    vnb  = 0;
                    cur_blksiz = min(ncolinvolvd-(j-1)*avai_blksiz, avai_blksiz);
                    colst = (j-1)*avai_blksiz;
                    coled = colst + cur_blksiz -1;
                    fst   = (rownbm -k)*nb+colst +1;
                    for (colj=colst; colj <= coled; colj++) {
                        st = (rownbm -k)*nb+colj +1;
                        ed = min(st+nb-1,n-1);
                        if (st > ed)
                            break;
                        if ((st == ed) && (colj != n-2))
                            break;
                        vlen = ed-fst+1;
                        vnb = vnb+1;
                    }
                    magma_bulge_findVTpos(n, nb, Vblksiz, colst, fst, ldv, ldt, &vpos, &tpos);
                    if ((vlen > 0) && (vnb > 0)) {
                        lapackf77_dlarfb( "R", "N", "F", "C", &ib_loc, &vlen, &vnb, V(vpos), &ldv, T(tpos), &ldt, E(i*nb_loc,fst), &lde, work, &ib_loc);
                    }
                }
            }
        } else {
            printf("ERROR SIDE %d \n",side);
        }
    } // END loop over the chunks

    magma_free_cpu(work);
    magma_free_cpu(work2);
}
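The core operation above is lapackf77_dlarfb: apply a block reflector H = I - V*T*V', held as (V, T), to a slice of E. A minimal standalone sketch of the same call pattern on toy sizes: factor a small V with dgeqrf, form the triangular T with dlarft, then apply Q' to a block B from the left:

    // Hedged sketch of the dlarfb pattern used above ("F","C" = forward,
    // columnwise storage). After the calls, B holds Q'*B.
    magma_int_t m = 8, k = 3, nrhs = 2, info, lwork = 64;
    magma_int_t ione = 1, sizeV = m*k, sizeB = m*nrhs;
    magma_int_t ISEED[4] = { 0, 0, 0, 1 };
    double V[8*3], T[3*3], B[8*2], tau[3], work[64];

    lapackf77_dlarnv( &ione, ISEED, &sizeV, V );   // random m-by-k V
    lapackf77_dlarnv( &ione, ISEED, &sizeB, B );   // random m-by-nrhs B

    lapackf77_dgeqrf( &m, &k, V, &m, tau, work, &lwork, &info );  // reflectors in V
    lapackf77_dlarft( "F", "C", &m, &k, V, &m, tau, T, &k );      // k-by-k T
    lapackf77_dlarfb( "L", "T", "F", "C", &m, &nrhs, &k,
                      V, &m, T, &k, B, &m, work, &nrhs );         // B = Q'*B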
Example #30
magma_int_t
magma_dcg_merge( magma_d_sparse_matrix A, magma_d_vector b, magma_d_vector *x,  
           magma_d_solver_par *solver_par ){

    // prepare solver feedback
    solver_par->solver = Magma_CGMERGE;
    solver_par->numiter = 0;
    solver_par->info = 0; 

    // some useful variables
    double c_zero = MAGMA_D_ZERO, c_one = MAGMA_D_ONE;
    magma_int_t dofs = A.num_rows;

    // GPU stream
    magma_queue_t stream[2];
    magma_event_t event[1];
    magma_queue_create( &stream[0] );
    magma_queue_create( &stream[1] );
    magma_event_create( &event[0] );

    // GPU workspace
    magma_d_vector r, d, z;
    magma_d_vinit( &r, Magma_DEV, dofs, c_zero );
    magma_d_vinit( &d, Magma_DEV, dofs, c_zero );
    magma_d_vinit( &z, Magma_DEV, dofs, c_zero );
    
    double *d1, *d2, *skp;
    magma_dmalloc( &d1, dofs*(1) );
    magma_dmalloc( &d2, dofs*(1) );
    // array for the parameters
    magma_dmalloc( &skp, 6 );       // skp = [alpha|beta|gamma|rho|tmp1|tmp2]


    // solver variables
    double alpha, beta, gamma, rho, tmp1, *skp_h;
    double nom, nom0, r0, betanom, den;

    // solver setup
    magma_dscal( dofs, c_zero, x->val, 1) ;                     // x = 0
    magma_dcopy( dofs, b.val, 1, r.val, 1 );                    // r = b
    magma_dcopy( dofs, b.val, 1, d.val, 1 );                    // d = b
    nom0 = betanom = magma_dnrm2( dofs, r.val, 1 );               
    nom = nom0 * nom0;                                           // nom = r' * r
    magma_d_spmv( c_one, A, d, c_zero, z );                      // z = A d
    den = MAGMA_D_REAL( magma_ddot(dofs, d.val, 1, z.val, 1) ); // den = d'* z
    solver_par->init_res = nom0;
    
    // array on host for the parameters
    magma_dmalloc_cpu( &skp_h, 6 );

    alpha = rho = gamma = tmp1 = c_one; 
    beta =  magma_ddot(dofs, r.val, 1, r.val, 1);
    skp_h[0]=alpha; 
    skp_h[1]=beta; 
    skp_h[2]=gamma; 
    skp_h[3]=rho; 
    skp_h[4]=tmp1; 
    skp_h[5]=MAGMA_D_MAKE(nom, 0.0);

    magma_dsetvector( 6, skp_h, 1, skp, 1 );
    
    if ( (r0 = nom * solver_par->epsilon) < ATOLERANCE ) 
        r0 = ATOLERANCE;
    if ( nom < r0 )
        return MAGMA_SUCCESS;
    // check positive definite
    if (den <= 0.0) {
        printf("Operator A is not postive definite. (Ar,r) = %f\n", den);
        return -100;
    }
    
    //Chronometry
    real_Double_t tempo1, tempo2;
    magma_device_sync(); tempo1=magma_wtime();
    if( solver_par->verbose > 0 ){
        solver_par->res_vec[0] = (real_Double_t) nom0;
        solver_par->timing[0] = 0.0;
    }
    
    // start iteration
    for( solver_par->numiter= 1; solver_par->numiter<solver_par->maxiter; 
                                                    solver_par->numiter++ ){

        magmablasSetKernelStream(stream[0]);
        
        // computes SpMV and dot product
        magma_dcgmerge_spmv1(  A, d1, d2, d.val, z.val, skp ); 
            
        // updates x, r, computes scalars and updates d
        magma_dcgmerge_xrbeta( dofs, d1, d2, x->val, r.val, d.val, z.val, skp ); 

        // check stopping criterion (asynchronous copy)
        magma_dgetvector_async( 1, skp+1, 1, skp_h+1, 1, stream[1] );
        magma_queue_sync( stream[1] );  // make sure skp_h has arrived before reading it
        betanom = sqrt(MAGMA_D_REAL(skp_h[1]));

        if( solver_par->verbose > 0 ){
            magma_device_sync(); tempo2=magma_wtime();
            if( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }

        if (  betanom  < r0 ) {
            break;
        }

    } 
    magma_device_sync(); tempo2=magma_wtime();
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    magma_dresidual( A, b, *x, &residual );
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if( solver_par->numiter < solver_par->maxiter){
        solver_par->info = 0;
    }else if( solver_par->init_res > solver_par->final_res ){
        if( solver_par->verbose > 0 ){
            if( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = -2;
    }
    else{
        if( solver_par->verbose > 0 ){
            if( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose] 
                        = (real_Double_t) tempo2-tempo1;
            }
        }
        solver_par->info = -1;
    }
    magma_d_vfree(&r);
    magma_d_vfree(&z);
    magma_d_vfree(&d);

    magma_free( d1 );
    magma_free( d2 );
    magma_free( skp );
    magma_free_cpu( skp_h );

    return MAGMA_SUCCESS;
}   /* magma_dcg_merge */
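For reference, the two merged kernels above (magma_dcgmerge_spmv1 and magma_dcgmerge_xrbeta) fuse the standard CG updates into two GPU kernel launches per iteration. A plain, unfused host sketch of the same iteration on a dense SPD matrix (cg_reference is a hypothetical helper, not a MAGMA routine):

    #include <stdlib.h>
    #include <math.h>

    // Hedged sketch: textbook CG on a dense column-major SPD matrix A.
    static void cg_reference( int n, const double *A, const double *b,
                              double *x, int maxiter, double tol )
    {
        double *r = (double*) malloc( n*sizeof(double) );
        double *d = (double*) malloc( n*sizeof(double) );
        double *z = (double*) malloc( n*sizeof(double) );
        double rho = 0.0;

        for (int i = 0; i < n; ++i) {          // x = 0, r = d = b
            x[i] = 0.0;  r[i] = b[i];  d[i] = b[i];
            rho += r[i]*r[i];                  // rho = r'*r
        }

        for (int iter = 1; iter <= maxiter && sqrt(rho) > tol; ++iter) {
            double den = 0.0;
            for (int i = 0; i < n; ++i) {      // z = A*d and den = d'*z
                double s = 0.0;                // (fused in magma_dcgmerge_spmv1)
                for (int j = 0; j < n; ++j)
                    s += A[i + j*n]*d[j];
                z[i] = s;
                den += d[i]*s;
            }
            double alpha = rho / den, rho_new = 0.0;
            for (int i = 0; i < n; ++i) {      // x, r updates and new rho
                x[i] += alpha*d[i];            // (fused in magma_dcgmerge_xrbeta)
                r[i] -= alpha*z[i];
                rho_new += r[i]*r[i];
            }
            double beta = rho_new / rho;
            for (int i = 0; i < n; ++i)        // d = r + beta*d
                d[i] = r[i] + beta*d[i];
            rho = rho_new;
        }
        free(r);  free(d);  free(z);
    }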