int main()
{
    double *A, *B, *C;
    int i,j,r,max_threads,size;
    double alpha, beta;
    double s_initial, s_elapsed;
    
    printf("Intializing data for matrix multiplication C=A*B for matrix\n\n"
            " A(%i*%i) and matrix B(%i*%i)\n",M,P,P,N);
    alpha = 1.0;
    beta = 0.0;

    printf("Allocating memory for matrices aligned on 64-byte boundary for better performance \n\n");
    A = ( double *)mkl_malloc(M*P*sizeof( double ),64);
    B = ( double *)mkl_malloc(N*P*sizeof( double ),64);
    C = ( double *)mkl_malloc(M*N*sizeof( double ),64);
    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: can`t allocate memory for matrices.\n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf("Intializing matrix data\n\n");
    size = M*P;
    for (i = 0; i < size; ++i)
    {
        A[i] = ( double )(i+1);
    }
    size = N*P;
    for (i = 0; i < size; ++i)
    {
        B[i] = ( double )(i-1);
    }

    printf("Finding max number of threads can use for parallel runs \n\n");
    max_threads = mkl_get_max_threads();

    printf("Running from 1 to %i threads \n\n",max_threads);
    for (i = 1; i <= max_threads; ++i)
    {
        size = M*N;
        for (j = 0; j < size; ++j)
        {
            C[j] = 0.0;
        }

	    printf("Requesting to use %i threads \n\n",i); 
	    mkl_set_num_threads(i);

	    printf("Measuring performance of matrix product using dgemm function\n"
		    " via CBLAS interface on %i threads \n\n",i);
	    s_initial = dsecnd();
	    for (r = 0; r < LOOP_COUNT; ++r)
	    {
    		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, P, alpha, A, P, B, N, beta, C, N);
            // multiply matrices with cblas_dgemm;
	    }
	    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

	    printf("Matrix multiplication using dgemm completed \n"
		    " at %.5f milliseconds using %d threads \n\n",
		    (s_elapsed * 1000),i);
        printf("Output the result: \n");
        size = M*N;
        for (i = 0; i < size; ++i)
        {
            printf("%i\t",(int)C[i]);
            if (i % N == N - 1)
                printf("\n");
        }
    }

    printf("Dellocating memory\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    return 0;
}
int check_factorization(int M, int N, double *A1, double *A2, int LDA, double *Q)
{
    double Anorm, Rnorm;
    double alpha, beta;
    int info_factorization;
    int i,j;
    double eps;

    eps = LAPACKE_dlamch_work('e');

    double *Ql       = (double *)malloc(M*N*sizeof(double));
    double *Residual = (double *)malloc(M*N*sizeof(double));
    double *work              = (double *)malloc(max(M,N)*sizeof(double));

    alpha=1.0;
    beta=0.0;

    if (M >= N) {
        /* Extract the R */
        double *R = (double *)malloc(N*N*sizeof(double));
        memset((void*)R, 0, N*N*sizeof(double));
        LAPACKE_dlacpy_work(LAPACK_COL_MAJOR,'u', M, N, A2, LDA, R, N);

        /* Perform Ql=Q*R */
        memset((void*)Ql, 0, M*N*sizeof(double));
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, N, (alpha), Q, LDA, R, N, (beta), Ql, M);
        free(R);
    }
    else {
        /* Extract the L */
        double *L = (double *)malloc(M*M*sizeof(double));
        memset((void*)L, 0, M*M*sizeof(double));
        LAPACKE_dlacpy_work(LAPACK_COL_MAJOR,'l', M, N, A2, LDA, L, M);

    /* Perform Ql=LQ */
        memset((void*)Ql, 0, M*N*sizeof(double));
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, M, N, M, (alpha), L, M, Q, LDA, (beta), Ql, M);
        free(L);
    }

    /* Compute the Residual */
    for (i = 0; i < M; i++)
        for (j = 0 ; j < N; j++)
            Residual[j*M+i] = A1[j*LDA+i]-Ql[j*M+i];

    Rnorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, Residual, M, work);
    Anorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, lapack_const(PlasmaInfNorm), M, N, A2, LDA, work);

    if (M >= N) {
        printf("============\n");
        printf("Checking the QR Factorization \n");
        printf("-- ||A-QR||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps));
    }
    else {
        printf("============\n");
        printf("Checking the LQ Factorization \n");
        printf("-- ||A-LQ||_oo/(||A||_oo.N.eps) = %e \n",Rnorm/(Anorm*N*eps));
    }

    if (isnan(Rnorm / (Anorm * N *eps)) || (Rnorm / (Anorm * N * eps) > 10.0) ) {
        printf("-- Factorization is suspicious ! \n");
        info_factorization = 1;
    }
    else {
        printf("-- Factorization is CORRECT ! \n");
        info_factorization = 0;
    }

    free(work); free(Ql); free(Residual);

    return info_factorization;
}
Beispiel #3
0
static void
CORE_dgetrf_rectil_rec(const PLASMA_desc A, int *IPIV, int *info,
                       double *pivot,
                       int thidx,  int thcnt,
                       int column, int width,
                       int ft,     int lt)
{
    int ld, jp, n1, n2, lm, tmpM, piv_sf;
    int ip, j, it, i, ldft;
    int max_i, max_it, thwin;
    double zone  = 1.0;
    double mzone = -1.0;
    double tmp1;
    double tmp2 = 0.;
    double pivval;
    double *Atop, *Atop2, *U, *L;
    double             abstmp1;
    int offset = A.i;

    ldft = BLKLDD(A, 0);
    Atop = A(0, 0) + column * ldft;

    if ( width > 1 ) {
        /* Assumption: N = min( M, N ); */
        n1 = width / 2;
        n2 = width - n1;

        Atop2 = Atop + n1 * ldft;

        CORE_dgetrf_rectil_rec( A, IPIV, info, pivot,
                                thidx, thcnt, column, n1, ft, lt );

        if ( *info != 0 )
            return;

        if (thidx == 0)
        {
            /* Swap to the right */
            int *lipiv = IPIV+column;
            int idxMax = column+n1;
            for (j = column; j < idxMax; ++j, ++lipiv) {
                ip = (*lipiv) - offset - 1;
                if ( ip != j )
                {
                    it = ip / A.mb;
                    i  = ip % A.mb;
                    ld = BLKLDD(A, it);
                    cblas_dswap(n2, Atop2                     + j, ldft,
                                    A(it, 0) + (column+n1)*ld + i, ld   );
                }
            }
            /* Trsm on the upper part */
            U = Atop2 + column;
            cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower,
                         CblasNoTrans, CblasUnit,
                         n1, n2, (zone),
                         Atop  + column, ldft,
                         U,              ldft );

            /* SIgnal to other threads that they can start update */
            CORE_dbarrier_thread( thidx, thcnt );
            pivval = *pivot;
            if ( pivval == 0.0 ) {
                *info = column+n1;
                return;
            } else {
                if ( fabs(pivval) >= sfmin ) {
                    piv_sf = 1;
                    pivval = 1.0 / pivval;
                } else {
                    piv_sf = 0;
                }
            }

            /* First tile */
            {
                L = Atop + column + n1;
                tmpM = min(ldft, A.m) - column - n1;

                /* Scale last column of L */
                if ( piv_sf ) {
                    cblas_dscal( tmpM, (pivval), L+(n1-1)*ldft, 1 );
                } else {
                    int i;
                    Atop2 = L+(n1-1)*ldft;
                    for( i=0; i < tmpM; i++, Atop2++)
                        *Atop2 = *Atop2 / pivval;
                }

                /* Apply the GEMM */
                cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                             tmpM, n2, n1,
                             (mzone), L,      ldft,
                                                 U,      ldft,
                             (zone),  U + n1, ldft );

                /* Search Max in first column of U+n1 */
                tmp2    = U[n1];
                max_it  = ft;
                max_i   = cblas_idamax( tmpM, U+n1, 1 ) + n1;
                tmp1    = U[max_i];
                abstmp1 = fabs(tmp1);
                max_i   += column;
            }
        }
        else
        {
            pivval = *pivot;
            if ( pivval == 0.0 ) {
                *info = column+n1;
                return;
            } else {
                if ( fabs(pivval) >= sfmin ) {
                    piv_sf = 1;
                    pivval = 1.0 / pivval;
                } else {
                    piv_sf = 0;
                }
            }

            ld = BLKLDD( A, ft );
            L  = A( ft, 0 ) + column * ld;
            lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;

            U = Atop2 + column;

            /* First tile */
            /* Scale last column of L */
            if ( piv_sf ) {
                cblas_dscal( lm, (pivval), L+(n1-1)*ld, 1 );
            } else {
                int i;
                Atop2 = L+(n1-1)*ld;
                for( i=0; i < lm; i++, Atop2++)
                    *Atop2 = *Atop2 / pivval;
            }

            /* Wait for pivoting and triangular solve to be finished
             * before to really start the update */
            CORE_dbarrier_thread( thidx, thcnt );

            /* Apply the GEMM */
            cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                         lm, n2, n1,
                         (mzone), L,         ld,
                                             U,         ldft,
                         (zone),  L + n1*ld, ld );

            /* Search Max in first column of L+n1*ld */
            max_it  = ft;
            max_i   = cblas_idamax( lm, L+n1*ld, 1 );
            tmp1    = L[n1*ld+max_i];
            abstmp1 = fabs(tmp1);
        }

        /* Update the other blocks */
        for( it = ft+1; it < lt; it++)
        {
            ld = BLKLDD( A, it );
            L  = A( it, 0 ) + column * ld;
            lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;

            /* Scale last column of L */
            if ( piv_sf ) {
                cblas_dscal( lm, (pivval), L+(n1-1)*ld, 1 );
            } else {
                int i;
                Atop2 = L+(n1-1)*ld;
                for( i=0; i < lm; i++, Atop2++)
                    *Atop2 = *Atop2 / pivval;
            }

            /* Apply the GEMM */
            cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                         lm, n2, n1,
                         (mzone), L,          ld,
                                             U,          ldft,
                         (zone),  L + n1*ld,  ld );

            /* Search the max on the first column of L+n1*ld */
            jp = cblas_idamax( lm, L+n1*ld, 1 );
            if ( fabs( L[n1*ld+jp] ) > abstmp1 ) {
                tmp1 = L[n1*ld+jp];
                abstmp1 = fabs(tmp1);
                max_i  = jp;
                max_it = it;
            }
        }

        jp = offset + max_it*A.mb + max_i;
        CORE_damax1_thread( tmp1, thidx, thcnt, &thwin,
                            &tmp2, pivot, jp + 1, IPIV + column + n1 );

        if ( thidx == 0 ) {
            U[n1] = *pivot; /* all threads have the pivot element: no need for synchronization */
        }
        if (thwin == thidx) { /* the thread that owns the best pivot */
            if ( jp-offset != column+n1 ) /* if there is a need to exchange the pivot */
            {
                ld = BLKLDD(A, max_it);
                Atop2 = A( max_it, 0 ) + (column + n1 )* ld + max_i;
                *Atop2 = tmp2;
            }
        }

        CORE_dgetrf_rectil_rec( A, IPIV, info, pivot,
                                thidx, thcnt, column+n1, n2, ft, lt );
        if ( *info != 0 )
            return;

        if ( thidx == 0 )
        {
            /* Swap to the left */
            int *lipiv = IPIV+column+n1;
            int idxMax = column+width;
            for (j = column+n1; j < idxMax; ++j, ++lipiv) {
                ip = (*lipiv) - offset - 1;
                if ( ip != j )
                {
                    it = ip / A.mb;
                    i  = ip % A.mb;
                    ld = BLKLDD(A, it);
                    cblas_dswap(n1, Atop + j,                 ldft,
                                    A(it, 0) + column*ld + i, ld  );
                }
            }
        }

    } else if ( width == 1 ) {

        /* Search maximum for column 0 */
        if ( column == 0 )
        {
            if ( thidx == 0 )
              tmp2 = Atop[column];

            /* First tmp1 */
            ld = BLKLDD(A, ft);
            Atop2   = A( ft, 0 );
            lm      = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;
            max_it  = ft;
            max_i   = cblas_idamax( lm, Atop2, 1 );
            tmp1    = Atop2[max_i];
            abstmp1 = fabs(tmp1);

            /* Update */
            for( it = ft+1; it < lt; it++)
            {
                Atop2= A( it, 0 );
                lm   = it == A.mt-1 ? A.m - it * A.mb : A.mb;
                jp   = cblas_idamax( lm, Atop2, 1 );
                if (  fabs(Atop2[jp]) > abstmp1 ) {
                    tmp1 = Atop2[jp];
                    abstmp1 = fabs(tmp1);
                    max_i  = jp;
                    max_it = it;
                }
            }

            jp = offset + max_it*A.mb + max_i;
            CORE_damax1_thread( tmp1, thidx, thcnt, &thwin,
                                &tmp2, pivot, jp + 1, IPIV + column );

            if ( thidx == 0 ) {
                Atop[0] = *pivot; /* all threads have the pivot element: no need for synchronization */
            }
            if (thwin == thidx) { /* the thread that owns the best pivot */
                if ( jp-offset != 0 ) /* if there is a need to exchange the pivot */
                {
                    Atop2 = A( max_it, 0 ) + max_i;
                    *Atop2 = tmp2;
                }
            }
        }

        CORE_dbarrier_thread( thidx, thcnt );

        /* If it is the last column, we just scale */
        if ( column == (min(A.m, A.n))-1 ) {

            pivval = *pivot;
            if ( pivval != 0.0 ) {
                if ( thidx == 0 ) {
                    if ( fabs(pivval) >= sfmin ) {
                        pivval = 1.0 / pivval;

                        /*
                         * We guess than we never enter the function with m == A.mt-1
                         * because it means that there is only one thread
                         */
                        lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;
                        cblas_dscal( lm - column - 1, (pivval), Atop+column+1, 1 );

                        for( it = ft+1; it < lt; it++)
                        {
                            ld = BLKLDD(A, it);
                            Atop2 = A( it, 0 ) + column * ld;
                            lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
                            cblas_dscal( lm, (pivval), Atop2, 1 );
                        }
                    } else {
                        /*
                         * We guess than we never enter the function with m == A.mt-1
                         * because it means that there is only one thread
                         */
                        int i;
                        Atop2 = Atop + column + 1;
                        lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;

                        for( i=0; i < lm-column-1; i++, Atop2++)
                            *Atop2 = *Atop2 / pivval;

                        for( it = ft+1; it < lt; it++)
                            {
                                ld = BLKLDD(A, it);
                                Atop2 = A( it, 0 ) + column * ld;
                                lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;

                                for( i=0; i < lm; i++, Atop2++)
                                    *Atop2 = *Atop2 / pivval;
                            }
                    }
                }
                else
                {
                    if ( fabs(pivval) >= sfmin ) {
                        pivval = 1.0 / pivval;

                        for( it = ft; it < lt; it++)
                        {
                            ld = BLKLDD(A, it);
                            Atop2 = A( it, 0 ) + column * ld;
                            lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;
                            cblas_dscal( lm, (pivval), Atop2, 1 );
                        }
                    } else {
                        /*
                         * We guess than we never enter the function with m == A.mt-1
                         * because it means that there is only one thread
                         */
                        int i;
                        for( it = ft; it < lt; it++)
                        {
                            ld = BLKLDD(A, it);
                            Atop2 = A( it, 0 ) + column * ld;
                            lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;

                            for( i=0; i < lm; i++, Atop2++)
                                *Atop2 = *Atop2 / pivval;
                        }
                    }
                }
            }
            else {
                *info = column + 1;
                return;
            }
        }
    }
}
Beispiel #4
0
int main(int argc, char **argv){


	printf("Computing dsyev...\n");
	
	int n, lda;
	double *A, *Acopy, *work, *w;
	int info, lwork;
	int i,j;
	double t1,t2,elapsed;
	struct timeval tp;
	int rtn;
	double normr, normb;

	n = 100; lda = 100;

	A = (double *)malloc(lda*n*sizeof(double)) ;
	if (A==NULL){ printf("error of memory allocation\n"); exit(0); }
	Acopy = (double *)malloc(lda*n*sizeof(double)) ;
	if (Acopy==NULL){ printf("error of memory allocation\n"); exit(0); }
	w=(double*)malloc(n*sizeof(double));
	
	for(i=0;i<lda*n;i++)
		A[i] = ((double) rand()) / ((double) RAND_MAX) - 0.5;
	for(i=0;i<n;i++)
	{
		for(j=0;j<n;j++)
			A[i+lda*j]=A[j+lda*i];
	}

    cblas_dcopy(lda*n,A,1,Acopy,1); 


	work=malloc(sizeof(double));
	lwork = -1;
	lapack_dsyev( lapack_compute_vectors, lapack_upper, n,  A, lda, w, work, lwork, &info); 

	lwork=work[0];
	free(work);
	work=malloc(lwork*sizeof(double));
	
	lapack_dsyev( lapack_compute_vectors, lapack_upper, n,  A, lda, w, work, lwork, &info); 
	
	double *tmp;
	tmp=(double*)malloc(n*lda*sizeof(double));
	for(i=0;i<lda*n;i++)
		tmp[i]=0;
	for(i=0;i<n;i++)
		tmp[i+lda*i]=1.0e0;
    cblas_dgemm ( CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, 1.0e0, A, lda, A, lda, -1.0e0, tmp, lda);
	double ortho = 0.0e0;

	double* v;
	v=malloc(n*sizeof(double));
	double* x;
	x=malloc(n*sizeof(double));
	int* isgn;
	isgn=malloc(n*sizeof(int));
	double est;
	int kase;
	
	double *work_dlange;
	work_dlange=malloc(n*sizeof(double));
	ortho = lapack_dlange( lapack_one_norm, n, n, tmp, lda, work_dlange);
	free(work_dlange);
	printf("Orthogonality error : %e\n",ortho);


	for(i=0;i<lda*n;i++)
		tmp[i]=0;
	for(i=0;i<n;i++)
		tmp[i+lda*i]=w[i];

	double *tmp2;
	tmp2=(double*)malloc(n*lda*sizeof(double));
	
    cblas_dgemm ( CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0e0, A, lda, tmp, lda, 0.0e0, tmp2, lda);

	for(i=0;i<lda*n;i++)
		tmp[i]=Acopy[i];

    cblas_dgemm ( CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, -1.0e0, tmp2, lda, A, lda, 1.0e0, tmp, lda);

	double normA;
	work_dlange=malloc(n*sizeof(double));
	normA = lapack_dlange( lapack_one_norm, n, n, A, lda, work_dlange);
	free(work_dlange);

	double repr = 0.0e0;
	work_dlange=malloc(n*sizeof(double));
	repr = lapack_dlange( lapack_one_norm, n, n, tmp, lda, work_dlange);
	free(work_dlange);

	printf("Reprentativity error : %e\n",repr);

	free(A);
	free(Acopy);
	free(work);
	free(tmp);
	free(tmp2);

	printf("*******************************************************\n");



	printf("Computing zheev...\n");
	

	n = 300; lda = 300;

	A = (double *)malloc(2*lda*n*sizeof(double)) ;
	if (A==NULL){ printf("error of memory allocation\n"); exit(0); }
	Acopy = (double *)malloc(2*lda*n*sizeof(double)) ;
	if (Acopy==NULL){ printf("error of memory allocation\n"); exit(0); }
	w=(double*)malloc(n*sizeof(double));

	for(i=0;i<2*lda*n;i++)
		A[i] = ((double) rand()) / ((double) RAND_MAX) - 0.5;

	for (i=0;i<n;i++)
		for (j=0;j<n;j++)
		{
			A[2*(i+lda*j)+1] = -A[2*(j+lda*i)+1];
			A[2*(i+lda*j)]   = A[2*(j+lda*i)];
		}

	for (i=0;i<n;i++)
		A[2*(i+lda*i)+1]=0;

	cblas_zcopy(lda*n,A,1,Acopy,1); 

	double *rwork;
	rwork=malloc((3*n-2)*sizeof(double));
	
	work=malloc(2*sizeof(double));
	lwork = -1;
	lapack_zheev( lapack_compute_vectors, lapack_upper, n,  A, lda, w, work, lwork, rwork, &info); 

	lwork=work[0];
	free(work);
	work=malloc(2*lwork*sizeof(double));
	
	lapack_zheev( lapack_compute_vectors, lapack_upper, n,  A, lda, w, work, lwork, rwork, &info); 

	tmp=(double*)malloc(2*n*lda*sizeof(double));
	double alpha[2];
	double beta[2];
	tmp2=(double*)malloc(2*n*lda*sizeof(double));
	alpha[0]=1.0e0;
	alpha[1]=0.0e0;
	beta[0]=-1.0e0;
	beta[1]=0.0e0;

	for (i=0;i<2*n*lda;i++)
		tmp[i]=0;
	for (i=0;i<n;i++)
		tmp[2*(i+lda*i)]=1;	
    cblas_zgemm ( CblasColMajor, CblasNoTrans, CblasConjTrans, n, n, n, alpha, A, lda, A, lda, beta, tmp, lda);


	
	ortho=cblas_dnrm2(2*n*n,tmp,1);
	
	printf("Orthogonality error : %e\n",ortho);
	

	for (i=0;i<n;i++)
	{
		for (j=0;j<n;j++)
		{
			tmp[2*(i+lda*j)]=A[2*(i+lda*j)]*w[j];
			tmp[2*(i+lda*j)+1]=A[2*(i+lda*j)+1]*w[j];
		}
	}
	
	cblas_zcopy(lda*n,Acopy,1,tmp2,1); 
    cblas_zgemm ( CblasColMajor, CblasNoTrans, CblasConjTrans, n, n, n, alpha, tmp, lda, A, lda, beta, tmp2, lda);

	repr=cblas_dnrm2(2*n*n,tmp2,1);
			
	printf("Reprentativity error : %e\n",repr);

	free(A);
	free(Acopy);
	free(work);
	free(tmp);
	free(tmp2);

	exit(0);

}
Beispiel #5
0
struct double_pair randmatstat(int t) {
    int n = 5;
    struct double_pair r;
    double *v = (double*)calloc(t,sizeof(double));
    double *w = (double*)calloc(t,sizeof(double));
    double *a = (double*)malloc((n)*(n)*sizeof(double));
    double *b = (double*)malloc((n)*(n)*sizeof(double));
    double *c = (double*)malloc((n)*(n)*sizeof(double));
    double *d = (double*)malloc((n)*(n)*sizeof(double));
    double *P = (double*)malloc((n)*(4*n)*sizeof(double));
    double *Q = (double*)malloc((2*n)*(2*n)*sizeof(double));
    double *PtP1 = (double*)malloc((4*n)*(4*n)*sizeof(double));
    double *PtP2 = (double*)malloc((4*n)*(4*n)*sizeof(double));
    double *QtQ1 = (double*)malloc((2*n)*(2*n)*sizeof(double));
    double *QtQ2 = (double*)malloc((2*n)*(2*n)*sizeof(double));
    for (int i=0; i < t; i++) {
        randmtzig_fill_randn(a, n*n);
        randmtzig_fill_randn(b, n*n);
        randmtzig_fill_randn(c, n*n);
        randmtzig_fill_randn(d, n*n);
        memcpy(P+0*n*n, a, n*n*sizeof(double));
        memcpy(P+1*n*n, b, n*n*sizeof(double));
        memcpy(P+2*n*n, c, n*n*sizeof(double));
        memcpy(P+3*n*n, d, n*n*sizeof(double));
        for (int j=0; j < n; j++) {
            for (int k=0; k < n; k++) {
                Q[2*n*j+k]       = a[k];
                Q[2*n*j+n+k]     = b[k];
                Q[2*n*(n+j)+k]   = c[k];
                Q[2*n*(n+j)+n+k] = d[k];
            }
        }
        cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                    n, n, 4*n, 1.0, P, 4*n, P, 4*n, 0.0, PtP1, 4*n);
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    4*n, 4*n, 4*n, 1.0, PtP1, 4*n, PtP1, 4*n, 0.0, PtP2, 4*n);
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    4*n, 4*n, 4*n, 1.0, PtP2, 4*n, PtP2, 4*n, 0.0, PtP1, 4*n);
        for (int j=0; j < n; j++) {
            v[i] += PtP1[(n+1)*j];
        }
        cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                    2*n, 2*n, 2*n, 1.0, Q, 2*n, Q, 2*n, 0.0, QtQ1, 2*n);
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    2*n, 2*n, 2*n, 1.0, QtQ1, 2*n, QtQ1, 2*n, 0.0, QtQ2, 2*n);
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    2*n, 2*n, 2*n, 1.0, QtQ2, 2*n, QtQ2, 2*n, 0.0, QtQ1, 2*n);
        for (int j=0; j < 2*n; j++) {
            w[i] += QtQ1[(2*n+1)*j];
        }
    }
    free(PtP1);
    free(PtP2);
    free(QtQ1);
    free(QtQ2);
    free(P);
    free(Q);
    free(a);
    free(b);
    free(c);
    free(d);
    double v1=0.0, v2=0.0, w1=0.0, w2=0.0;
    for (int i=0; i < t; i++) {
        v1 += v[i]; v2 += v[i]*v[i];
        w1 += w[i]; w2 += w[i]*w[i];
    }
    free(v);
    free(w);
    r.s1 = sqrt((t*(t*v2-v1*v1))/((t-1)*v1*v1));
    r.s2 = sqrt((t*(t*w2-w1*w1))/((t-1)*w1*w1));
    return r;
}
Beispiel #6
0
void dgetrf( long m, long n, double a[], long lda, long ipiv[], long *info )
{
  /**
   *  -- LAPACK routine (version 1.1) --
   *     Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,
   *     Courant Institute, Argonne National Lab, and Rice University
   *     March 31, 1993
   *
   *     .. Scalar Arguments ..*/
  /**     ..
   *     .. Array Arguments ..*/
#undef ipiv_1
#define ipiv_1(a1) ipiv[a1-1]
#undef a_2
#define a_2(a1,a2) a[a1-1+lda*(a2-1)]
  /**     ..
   *
   *  Purpose
   *  =======
   *
   *  DGETRF computes an LU factorization of a general M-by-N matrix A
   *  using partial pivoting with row interchanges.
   *
   *  The factorization has the form
   *     A = P * L * U
   *  where P is a permutation matrix, L is lower triangular with unit
   *  diagonal elements (lower trapezoidal if m > n), and U is upper
   *  triangular (upper trapezoidal if m < n).
   *
   *  This is the right-looking Level 3 BLAS version of the algorithm.
   *
   *  Arguments
   *  =========
   *
   *  M       (input) INTEGER
   *          The number of rows of the matrix A.  M >= 0.
   *
   *  N       (input) INTEGER
   *          The number of columns of the matrix A.  N >= 0.
   *
   *  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N)
   *          On entry, the M-by-N matrix to be factored.
   *          On exit, the factors L and U from the factorization
   *          A = P*L*U; the unit diagonal elements of L are not stored.
   *
   *  LDA     (input) INTEGER
   *          The leading dimension of the array A.  LDA >= max(1,M).
   *
   *  IPIV    (output) INTEGER array, dimension (min(M,N))
   *          The pivot indices; for 1 <= i <= min(M,N), row i of the
   *          matrix was interchanged with row IPIV(i).
   *
   *  INFO    (output) INTEGER
   *          = 0:  successful exit
   *          < 0:  if INFO = -i, the i-th argument had an illegal value
   *          > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
   *                has been completed, but the factor U is exactly
   *                singular, and division by zero will occur if it is used
   *                to solve a system of equations.
   *
   *  =====================================================================
   *
   *     .. Parameters ..*/
#undef one
#define one 1.0e+0
  /**     ..
   *     .. Local Scalars ..*/
  long            i, iinfo, j, jb, nb;
  /**     ..
   *     .. Intrinsic Functions ..*/
  /*      intrinsic          max, min;*/
  /**     ..
   *     .. Executable Statements ..
   *
   *     Test the input parameters.
   **/
  /*-----implicit-declarations-----*/
  /*-----end-of-declarations-----*/
  *info = 0;
  if( m<0 ) {
    *info = -1;
  } else if( n<0 ) {
    *info = -2;
  } else if( lda<max( 1, m ) ) {
    *info = -4;
  }
  if( *info!=0 ) {
    xerbla( "dgetrf", -*info );
    return;
  }
  /**
   *     Quick return if possible
   **/
  if( m==0 || n==0 )
    return;
  /**
   *     Determine the block size for this environment.
   **/
  nb = ilaenv( 1, "dgetrf", " ", m, n, -1, -1 );
  if( nb<=1 || nb>=min( m, n ) ) {
    /**
     *        Use unblocked code.
     **/
    dgetf2( m, n, a, lda, ipiv, info );
  } else {
    /**
     *        Use blocked code.
     **/
    for (j=1 ; nb>0?j<=min( m, n ):j>=min( m, n ) ; j+=nb) {
      jb = min( min( m, n )-j+1, nb );
      /**
       *           Factor diagonal and subdiagonal blocks and test for exact
       *           singularity.
       **/
      dgetf2( m-j+1, jb, &a_2( j, j ), lda, &ipiv_1( j ), &iinfo );
      /**
       *           Adjust INFO and the pivot indices.
       **/
      if( *info==0 && iinfo>0 )
        *info = iinfo + j - 1;
      for (i=j ; i<=min( m, j+jb-1 ) ; i+=1) {
        ipiv_1( i ) = j - 1 + ipiv_1( i );
      }
      /**
       *           Apply interchanges to columns 1:J-1.
       **/
      dlaswp( j-1, a, lda, j, j+jb-1, ipiv, 1 );

      if( j+jb<=n ) {
        /**
         *              Apply interchanges to columns J+JB:N.
         **/
        dlaswp( n-j-jb+1, &a_2( 1, j+jb ), lda, j, j+jb-1,
               ipiv, 1 );
        /**
         *              Compute block row of U.
         **/
        cblas_dtrsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans,
                    CblasUnit, jb, n-j-jb+1, one, &a_2( j, j ), lda,
                    &a_2( j, j+jb ), lda );
        if( j+jb<=m ) {
          /**
           *                 Update trailing submatrix.
           **/
          cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m-j-jb+1,
                      n-j-jb+1, jb, -one, &a_2( j+jb, j ), lda,
                      &a_2( j, j+jb ), lda, one, &a_2( j+jb, j+jb ), lda );
        }
      }
    }
  }
  return;
  /**
   *     End of DGETRF
   **/
}
/*
 * Use DIIS to help SCF
 */
void calculateSCFDIIS(molecule_t *molecule) {
#define EPS 0.0000000000001
#define DEL 0.0000000000001
	double **fs[6], **es[6], **b, *c;
	int **piv;

	hamiltonian(molecule);
	sqrtMolecule(molecule);

	int n = molecule->orbitals;	//So that the same thing does not need to be typed repeatedly.
	int count = 0;

	double elec, energy = 0, elast, rms;

	double **f0, **f1, **f2, **c0, **c1, **d0, **d1, **work1, **work2, **work3,
	        **ham, **shalf, **s;

	double **sort;

	f0 = calloc_contiguous(2, sizeof(double), n, n);
	f1 = calloc_contiguous(2, sizeof(double), n, n);
	f2 = calloc_contiguous(2, sizeof(double), n, n);
	c0 = calloc_contiguous(2, sizeof(double), n, n);
	c1 = calloc_contiguous(2, sizeof(double), n, n);
	d0 = calloc_contiguous(2, sizeof(double), n, n);
	d1 = calloc_contiguous(2, sizeof(double), n, n);
	work1 = calloc_contiguous(2, sizeof(double), n, n);
	work2 = calloc_contiguous(2, sizeof(double), n, n);
	work3 = calloc_contiguous(2, sizeof(double), n, n);
	ham = calloc_contiguous(2, sizeof(double), n, n);
	shalf = calloc_contiguous(2, sizeof(double), n, n);
	sort = calloc_contiguous(2, sizeof(double), n, n);
	b = calloc_contiguous(2, sizeof(double), 7, 7);
	c = calloc(7, sizeof(double));
	s = calloc_contiguous(2, sizeof(double), n, n);
	piv = calloc_contiguous(2, sizeof(double), 7, 7);
	for(int i = 0; i < 6; i++) {
		fs[i] = calloc_contiguous(2, sizeof(double), n, n);
		es[i] = calloc_contiguous(2, sizeof(double), n, n);
	}

	for(int i = 0; i < n; i++) {
		for(int j = 0; j < n; j++) {
			s[i][j] = molecule->overlap[i][j];
			shalf[i][j] = molecule->symmetric[i][j];
		}
	}
	printf("\nElec\t\tEnergy\t\tDiff\t\tRMS\n");
	do {
		elast = energy;
		if(count == 0) {
			for(int i = 0; i < n; i++) {
				for(int j = 0; j < n; j++) {
					//Find the initial Fock guess.
					f0[i][j] = ham[i][j] = molecule->hamiltonian[i][j];
				}
			}
		} else {
			memcpy(*d1, *d0, n * n * sizeof(double));
			for(int i = 0; i < n; i++) {
				for(int j = 0; j < n; j++) {
					f0[i][j] = ham[i][j];
					for(int k = 0; k < n; k++) {
						for(int l = 0; l < n; l++) {
							f0[i][j] += d0[k][l]
							    * (2 * molecule->two_electron[TEI(i, j, k, l)]
							        - molecule->two_electron[TEI(i, k, j, l)]);
						}
					}
				}
			}
		}

		//DIIS extrapolation.
		memcpy(*(fs[count % 6]), *f0, n * n * sizeof(double));
		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, *s,
		    n, *d0, n, 0, *work1, n);
		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0,
		    *work1, n, *f0, n, 0, *work2, n);
		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0,
		    *f0, n, *d0, n, 0, *work1, n);
		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0,
		    *work1, n, *s, n, 0, *work3, n);
		for(int i = 0; i < n; i++) {
			for(int j = 0; j < n; j++) {
				es[count % 6][i][j] = work3[i][j] - work2[i][j];
			}
		}
		if(count >= 6) {
			for(int i = 0; i < ((count > 6)? 6: count); i++) {
				for(int j = 0; j < ((count > 6)? 6: count); j++) {
					b[i][j] = 0;
					for(int k = 0; k < n; k++) {
						for(int l = 0; l < n; l++) {
							b[i][j] += es[i][k][l] * es[j][k][l];
						}
					}
				}
			}

			if(count < 6) {
				for(int i = 0; i < 6; i++) {
					for(int j = 0; j < 6; j++) {
						if(i < count && j < count) {
							continue;
						}
						if(i == j) {
							b[i][j] = 1;
						} else {
							b[i][j] = 0;
						}
					}
				}
			}

			for(int i = 0; i < 6; i++) {
				b[6][i] = -1;
				b[i][6] = -1;
				c[i] = 0;
			}
			b[6][6] = 0;
			c[6] = -1;
			LAPACKE_dgesv(LAPACK_ROW_MAJOR, 7, 1, *b, 7, *piv, c, 1);

			for(int i = 0; i < n; i++) {
				for(int j = 0; j < n; j++) {
					f2[i][j] = 0;
					for(int m = 0; m < 6; m++) {
						f2[i][j] += c[m] * fs[m][i][j];
					}
				}
			}

			cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, n, n, n, 1.0,
			    *shalf, n, *f2, n, 0, *work1, n);
			cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0,
			    *work1, n, *shalf, n, 0, *f1, n);
		} else {
			cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, n, n, n, 1.0,
			    *shalf, n, *f0, n, 0, *work1, n);
			cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0,
			    *work1, n, *shalf, n, 0, *f1, n);
		}

		memset(work1[0], 0, n * n * sizeof(double));
		memset(work2[0], 0, n * n * sizeof(double));
		memset(work3[0], 0, n * n * sizeof(double));

		LAPACKE_dgeev(LAPACK_ROW_MAJOR, 'N', 'V', n, *f1, n, *work1, *work2,
		    *work3, n, *c1, n);

		//Prepare for sorting.

		for(int i = 0; i < n; i++) {
			for(int j = 0; j < n; j++) {
				work2[i][j] = c1[i][j];
			}
		}

		//Sort
		for(int i = 0; i < n; i++) {
			sort[i] = work1[0] + i;
		}

		qsort(sort, n, sizeof(double *), comparedd);

		//Sift through data.
		for(int i = 0; i < n; i++) {
			unsigned long off = ((unsigned long) sort[i]
			    - (unsigned long) work1[0]);
			off /= sizeof(double);
			for(int j = 0; j < n; j++) {
				c1[j][i] = work2[j][off];
			}

		}

		cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0,
		    *shalf, n, *c1, n, 0, *c0, n);

		for(int i = 0; i < n; i++) {
			for(int j = 0; j < n; j++) {
				d0[i][j] = 0;
				for(int k = 0; k < molecule->electrons / 2; k++) {
					d0[i][j] += c0[i][k] * c0[j][k];
				}
			}
		}

		elec = 0;
		for(int i = 0; i < n; i++) {
			for(int j = 0; j < n; j++) {
				elec += d0[i][j] * (ham[i][j] + f0[i][j]);
			}
		}
		energy = elec + molecule->enuc;
		rms = 0;
		for(int i = 0; i < n; i++) {
			for(int j = 0; j < n; j++) {
				rms += (d0[i][j] - d1[i][j]) * (d0[i][j] - d1[i][j]);
			}
		}
		rms = sqrt(rms);

		count++;
		printf("%d\t%.15f\t%.15f\t%.15f\t%.15f\n", count, elec, energy, fabs(elast - energy), rms);

	} while(count < 100 && (fabs(elast - energy) > EPS && rms > DEL));

	molecule->scf_energy = energy;

	for(int i = 0; i < n; i++) {
		for(int j = 0; j < n; j++) {
			molecule->density[i][j] = d0[i][j];
			molecule->fock[i][j] = f0[i][j];
			molecule->molecular_orbitals[i][j] = c0[i][j];
			molecule->molecular_eigs[i][j] = ((i == j)? sort[i][0]: 0);
		}
	}

	free_mult_contig(16, c0, c1, d0, d1, f0, f1, f2, ham, shalf, work1, work2,
	    work3, sort, b, c, s);
	for(int i = 0; i < 6; i++) {
		free(fs[i]);
		free(es[i]);
	}
}
Beispiel #8
0
// This dMList function creates a list of the dM values giving the probabilities of mutations
static PyObject *dMList(PyObject *self, PyObject *args) {
    // Calling variables are (in order): uts, only_need, n_aa, length, grs, dmlist, residue_to_compute, iwt, brs
    PyObject *uts, *only_need, *grs, *r_grs_tuple, *gr_diag, *p, *p_inv, *dmlist, *brs, *brz;
    long n_aa, length, n_aa2, n_uts, residue_to_compute, i_ut, x, y, index, only_need_index, only_need_i, only_need_i_n_aa, index2, iwt, n_aa3, z;
    double *arr_dmlist, *cp_inv, *cp, *cgr_diag, *cexpd, *cbrz, *cvrz, *cvrz_p_inv;
    complex double *complex_cp_inv, *complex_cp, *complex_cgr_diag, *complex_cexpd, *complex_naa2_list, *complex_naa_list, *complex_cvrz, *complex_cvrz_p_inv, *complex_cbrz;
    double ut, exp_utdx, exp_utdy, dx, dy;
    complex double complex_exp_utdx, complex_exp_utdy, complex_dx, complex_dy;
    int array_type;
#ifdef USE_ACCELERATE_CBLAS
    complex double complex_one = 1, complex_zero = 0;
#else
    complex double complex_dmxy, complex_v_p_inv_xy;
    double dmxy, v_p_inv_xy;
    long yindex, irowcolumn;
#endif
    // Parse the arguments.  
    if (! PyArg_ParseTuple( args, "O!O!llO!O!llO!", &PyList_Type, &uts, &PyList_Type, &only_need, &n_aa, &length, &PyList_Type, &grs, &PyArray_Type, &dmlist, &residue_to_compute, &iwt, &PyList_Type, &brs)) {
        PyErr_SetString(PyExc_TypeError, "Invalid calling arguments to dMList.");
        return NULL;
    }
    // Error checking on arguments
    if (length < 1) { // length of the protein
        PyErr_SetString(PyExc_ValueError, "length is less than one.");
        return NULL;
    }
    if (n_aa < 1) { // number of amino acids.  Normally will be 20.
        PyErr_SetString(PyExc_ValueError, "n_aa is less than one.");
        return NULL;
    }
    if (PyList_GET_SIZE(grs) != length) { // make sure grs is of the same size as length
        PyErr_SetString(PyExc_ValueError, "grs is not of the same size as length.");
        return NULL;
    }
    n_uts = PyList_GET_SIZE(uts); // number of entries in uts
    if (n_uts < 1) { // make sure there are entries in uts 
        PyErr_SetString(PyExc_ValueError, "uts has no entries.");
        return NULL;
    }
    if (PyList_GET_SIZE(only_need) != n_uts * length) { // make sure only_need is of correct size
        PyErr_SetString(PyExc_ValueError, "only_need is of wrong size.");
        return NULL;
    }
    if (! ((residue_to_compute >= 0) && (residue_to_compute < length))) {
        PyErr_SetString(PyExc_ValueError, "Invalid value for residue_to_compute.");
        return NULL;
    }
    if (! ((iwt >= 0) && (iwt < n_aa))) {
        PyErr_SetString(PyExc_ValueError, "Invalid value for iwt.");
        return NULL;
    }
    if (PyList_GET_SIZE(brs) != n_aa) { // make sure brs has one entry for each amino acid
        PyErr_SetString(PyExc_ValueError, "brs is not of length equal to n_aa");
        return NULL;
    }
    n_aa2 = n_aa * n_aa; // square of the number of amino acids
    n_aa3 = n_aa2 * n_aa; // cube of the number of amino acids
    // The results will be returned in a numpy ndarray 'float_' (C type double) array called dmlist.
    // This array will be of size length * n_uts * n_aa3.
    arr_dmlist = (double *) PyArray_DATA(dmlist); // this is the data array of dmlist
    long const sizeof_cexpd = n_aa2 * sizeof(double);
    long const complex_sizeof_cexpd = n_aa2 * sizeof(complex double);
    // gr_diag, p, and p_inv are the eigenvalues, left, and right diagonalizing matrices of gr
    r_grs_tuple = PyList_GET_ITEM(grs, residue_to_compute);
    gr_diag = PyTuple_GET_ITEM(r_grs_tuple, 0); 
    p = PyTuple_GET_ITEM(r_grs_tuple, 1);
    p_inv = PyTuple_GET_ITEM(r_grs_tuple, 2);
    // Now begin filling arr_dmlist with the appropriate values
    index = 0;
    only_need_index = residue_to_compute * n_uts;
    // determine if these arrays are complex double or real doubles
    array_type = PyArray_TYPE(gr_diag);
    if (array_type == NPY_DOUBLE) { // array is of doubles, real not complex
        // Note that these next assignments assume that the arrays are C-style contiguous
        cp = PyArray_DATA(p);
        cp_inv = PyArray_DATA(p_inv);
        cgr_diag = PyArray_DATA(gr_diag);
        cexpd = (double *) malloc(sizeof_cexpd); // allocate memory
        cvrz = (double *) malloc(sizeof_cexpd); // allocate memory
        cvrz_p_inv = (double *) malloc(sizeof_cexpd); // allocate memory
        for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values
            only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need
            only_need_index++;
            if (only_need_i == -1) { // we don't need to do anything for these entries in dmlist
                index += n_aa3;
            } else { // we need to compute at least some entries in dmlist
                ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut));  // ut value
                // Entries of cexpd are defined by D_xy = (exp(ut d_x) - exp(ut d_y)) / (d_x - d_y)
                // for x != y, and D_yy = ut exp(d_x ut)
                index2 = 0;
                for (x = 0; x < n_aa; x++) {
                    dx = cgr_diag[x];
                    exp_utdx = exp(ut * dx);
                    for (y = 0; y < n_aa; y++) {
                        if (y == x) {
                            cexpd[index2] = ut * exp_utdx;
                        } else {
                            dy = cgr_diag[y];
                            exp_utdy = exp(ut * dy);
                            cexpd[index2] = (exp_utdx - exp_utdy) / (dx - dy);
                        }
                        index2++;
                    }
                }
                for (z = 0; z < n_aa; z++) {
                    // compute derivative with respect to z
                    if (z == iwt) { // don't compute values with respect to wildtype
                        index += n_aa2;
                        continue;
                    }
                    brz = PyList_GET_ITEM(brs, z);
                    cbrz = PyArray_DATA(brz);
                    // cvrz is the element-by-element product of cbrz and cexpd
                    for (index2 = 0; index2 < n_aa2; index2++) {
                        cvrz[index2] = cbrz[index2] * cexpd[index2];
                    }
#ifdef USE_ACCELERATE_CBLAS
                    // multiply the matrices using cblas_dgemm
                    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cvrz, n_aa, cp_inv, n_aa, (double) 0.0, cvrz_p_inv, n_aa); // multiply cvrz and cp_inv into cvrz_p_inv
#else
                    // multiply the matrices in pure C code
                    // multiply cvrz and cp_inv into cvrz_p_inv
                    index2 = 0;
                    for (x = 0; x < n_aa2; x += n_aa) {
                        for (y = 0; y < n_aa; y++) {
                            v_p_inv_xy = 0.0;
                            yindex = y;
                            for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) {
                                v_p_inv_xy += cvrz[irowcolumn] * cp_inv[yindex];
                                yindex += n_aa;
                            }
                            cvrz_p_inv[index2++] = v_p_inv_xy;
                        }
                    }
#endif
                    if (only_need_i == -2) { // we need to compute all of these entries in dmlist
#ifdef USE_ACCELERATE_CBLAS
                        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cp, n_aa, cvrz_p_inv, n_aa, (double) 0.0, &arr_dmlist[index], n_aa); // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2]
                        index += n_aa2;
#else
                        // multiply the matrices in pure C code, and fill dmlist with the results
                        // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2]
                        for (x = 0; x < n_aa2; x += n_aa) {
                            for (y = 0; y < n_aa; y++) {
                                dmxy = 0.0;
                                yindex = y;
                                for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) {
                                    dmxy += cp[irowcolumn] * cvrz_p_inv[yindex];
                                    yindex += n_aa;
                                }
                                arr_dmlist[index++] = dmxy;
                            }
                        }
#endif
                    } else { // we need to compute entries in dmlist only for x = only_need_i
                        only_need_i_n_aa = only_need_i * n_aa;
                        index += only_need_i_n_aa;
#ifdef USE_ACCELERATE_CBLAS
                        // do the matrix vector multiplication using cblas, and put results in dmlist
                        cblas_dgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, (double) 1.0, cvrz_p_inv, n_aa, &cp[only_need_i_n_aa], 1, (double) 0.0, &arr_dmlist[index], 1); 
                        index += n_aa2 - only_need_i_n_aa;
#else
                        // do the matrix vector multiplication in pure C code, and put results in mlist
                        for (y = 0; y < n_aa; y++) {
                            dmxy = 0.0;
                            yindex = y;
                            for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) {
                                dmxy += cp[irowcolumn] * cvrz_p_inv[yindex];
                                yindex += n_aa;
                            }
                            arr_dmlist[index++] = dmxy;
                        }
                        index += n_aa2 - only_need_i_n_aa - n_aa;
#endif
                    }
                }
            }
        }
        free(cexpd);
        free(cvrz);
        free(cvrz_p_inv);
    } else if (array_type == NPY_CDOUBLE) { // array is of complex doubles
        // Note that these next assignments assume that the arrays are C-style contiguous
        complex_cp = PyArray_DATA(p);
        complex_cp_inv = PyArray_DATA(p_inv);
        complex_cgr_diag = PyArray_DATA(gr_diag);
        complex_cexpd = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory
        complex_cvrz = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory
        complex_cvrz_p_inv = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory
        complex_naa2_list = (complex double *) malloc(n_aa2 * sizeof(complex double)); // allocate memory
        complex_naa_list = (complex double *) malloc(n_aa * sizeof(complex double)); // allocate memory
        for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values
            only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need
            only_need_index++;
            if (only_need_i == -1) { // we don't need to do anything for these entries in dmlist
                index += n_aa3;
            } else { // we need to compute at least some entries in dmlist
                ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut));  // ut value
                // Entries of cexpd are defined by D_xy = (exp(ut d_x) - exp(ut d_y)) / (d_x - d_y)
                // for x != y, and D_yy = ut exp(d_x ut)
                index2 = 0;
                for (x = 0; x < n_aa; x++) {
                    complex_dx = complex_cgr_diag[x];
                    complex_exp_utdx = cexp(ut * complex_dx);
                    for (y = 0; y < n_aa; y++) {
                        if (y == x) {
                            complex_cexpd[index2] = ut * complex_exp_utdx;
                        } else {
                            complex_dy = complex_cgr_diag[y];
                            complex_exp_utdy = cexp(ut * complex_dy);
                            complex_cexpd[index2] = (complex_exp_utdx - complex_exp_utdy) / (complex_dx - complex_dy);
                        }
                        index2++;
                    }
                }
                for (z = 0; z < n_aa; z++) {
                    // compute derivative with respect to z
                    if (z == iwt) { // don't compute values with respect to wildtype
                        index += n_aa2;
                        continue;
                    }
                    brz = PyList_GET_ITEM(brs, z);
                    complex_cbrz = PyArray_DATA(brz);
                    // cvrz is the element-by-element product of cbrz and cexpd
                    for (index2 = 0; index2 < n_aa2; index2++) {
                        complex_cvrz[index2] = complex_cbrz[index2] * complex_cexpd[index2];
                    }
#ifdef USE_ACCELERATE_CBLAS
                    // multiply the matrices using cblas_zgemm
                    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cvrz, n_aa, complex_cp_inv, n_aa, &complex_zero, complex_cvrz_p_inv, n_aa); // multiply cvrz and cp_inv into cvrz_p_inv
#else
                    // multiply the matrices in pure C code
                    // multiply cvrz and cp_inv into cvrz_p_inv
                    index2 = 0;
                    for (x = 0; x < n_aa2; x += n_aa) {
                        for (y = 0; y < n_aa; y++) {
                            complex_v_p_inv_xy = 0.0;
                            yindex = y;
                            for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) {
                                complex_v_p_inv_xy += complex_cvrz[irowcolumn] * complex_cp_inv[yindex];
                                yindex += n_aa;
                            }
                            complex_cvrz_p_inv[index2++] = complex_v_p_inv_xy;
                        }
                    }
#endif
                    if (only_need_i == -2) { // we need to compute all of these entries in dmlist
#ifdef USE_ACCELERATE_CBLAS
                        cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cp, n_aa, complex_cvrz_p_inv, n_aa, &complex_zero, complex_naa2_list, n_aa); // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2]
                        for (index2 = 0; index2 < n_aa2; index2++) {
                            arr_dmlist[index + index2] = creal(complex_naa2_list[index2]);
                        }
                        index += n_aa2;
#else
                        // multiply the matrices in pure C code, and fill dmlist with the results
                        // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2]
                        for (x = 0; x < n_aa2; x += n_aa) {
                            for (y = 0; y < n_aa; y++) {
                                complex_dmxy = 0.0;
                                yindex = y;
                                for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) {
                                    complex_dmxy += complex_cp[irowcolumn] * complex_cvrz_p_inv[yindex];
                                    yindex += n_aa;
                                }
                                arr_dmlist[index++] = creal(complex_dmxy);
                            }
                        }
#endif
                    } else { // we need to compute entries in dmlist only for x = only_need_i
                        only_need_i_n_aa = only_need_i * n_aa;
                        index += only_need_i_n_aa;
#ifdef USE_ACCELERATE_CBLAS
                        // do the matrix vector multiplication using cblas, and put results in dmlist
                        cblas_zgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, &complex_one, complex_cvrz_p_inv, n_aa, &complex_cp[only_need_i_n_aa], 1, &complex_zero, complex_naa_list, 1); 
                        for (index2 = 0; index2 < n_aa; index2++) {
                            arr_dmlist[index + index2] = creal(complex_naa_list[index2]);
                        }
                        index += n_aa2 - only_need_i_n_aa;
#else
                        // do the matrix vector multiplication in pure C code, and put results in mlist
                        for (y = 0; y < n_aa; y++) {
                            complex_dmxy = 0.0;
                            yindex = y;
                            for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) {
                                complex_dmxy += complex_cp[irowcolumn] * complex_cvrz_p_inv[yindex];
                                yindex += n_aa;
                            }
                            arr_dmlist[index++] = creal(complex_dmxy);
                        }
                        index += n_aa2 - only_need_i_n_aa - n_aa;
#endif
                    }
                }
            }
        }
        free(complex_cexpd);
        free(complex_cvrz);
        free(complex_cvrz_p_inv);
        free(complex_naa2_list);
        free(complex_naa_list);
    } else { // array is of neither real nor complex doubles
        PyErr_SetString(PyExc_ValueError, "matrices are neither double nor complex doubles.");
        return NULL;
    }
    return PyInt_FromLong((long) 1);
}
Beispiel #9
0
template <typename fptype> static inline int
lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
            const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags)
{
    int ldsrc1 = src1_step / sizeof(fptype);
    int ldsrc2 = src2_step / sizeof(fptype);
    int ldsrc3 = src3_step / sizeof(fptype);
    int lddst = dst_step / sizeof(fptype);
    int c_m, c_n, d_m;
    CBLAS_TRANSPOSE transA, transB;

    if(flags & CV_HAL_GEMM_2_T)
    {
        transB = CblasTrans;
        if(flags & CV_HAL_GEMM_1_T )
        {
            d_m = a_n;
        }
        else
        {
            d_m = a_m;
        }
    }
    else
    {
        transB = CblasNoTrans;
        if(flags & CV_HAL_GEMM_1_T )
        {
            d_m = a_n;
        }
        else
        {
            d_m = a_m;
        }
    }

    if(flags & CV_HAL_GEMM_3_T)
    {
        c_m = d_n;
        c_n = d_m;
    }
    else
    {
        c_m = d_m;
        c_n = d_n;
    }

    if(flags & CV_HAL_GEMM_1_T )
    {
        transA = CblasTrans;
        std::swap(a_n, a_m);
    }
    else
    {
        transA = CblasNoTrans;
    }

    if(src3 != dst && beta != 0.0 && src3_step != 0) {
        if(flags & CV_HAL_GEMM_3_T)
            transpose(src3, ldsrc3, dst, lddst, c_m, c_n);
        else
            copy_matrix(src3, ldsrc3, dst, lddst, c_m, c_n);
    }
    else if (src3 == dst && (flags & CV_HAL_GEMM_3_T)) //actually transposing C in this case done by openCV
        return CV_HAL_ERROR_NOT_IMPLEMENTED;
    else if(src3_step == 0 && beta != 0.0)
        set_value(dst, lddst, (fptype)0.0, d_m, d_n);

    if(typeid(fptype) == typeid(float))
        cblas_sgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (float)alpha, (float*)src1, ldsrc1, (float*)src2, ldsrc2, (float)beta, (float*)dst, lddst);
    else if(typeid(fptype) == typeid(double))
        cblas_dgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (double)alpha, (double*)src1, ldsrc1, (double*)src2, ldsrc2, (double)beta, (double*)dst, lddst);

    return CV_HAL_ERROR_OK;
}
Beispiel #10
0
int main(int argc, char * argv[]) {

	std::string file;

	int RDN = 0;

	char c;
	while ((c = getopt(argc, argv, "A:R:")) != -1) {
		switch (c) {
		case 'A':
			file = optarg;
			break;
		case 'R':
			RDN = atoi(optarg);
			break;
		}
	}

//	file= "../../SVM/epsilon_normalized";
//	file = "data/dense.svm";
//	file = "data/a1a";
//	file = "../../SVM/random_dense";

	stringstream ss("");
	ss << file << "_log";

	ofstream logFile;
	logFile.open(ss.str().c_str());

	omp_set_num_threads(1);

	const int MAXIMUM_THREADS = 64;
	std::vector<gsl_rng *> rs = randomNumberUtil::inittializeRandomSeeds(
			MAXIMUM_THREADS);

	int n = 1000; // this value was used for experiments
	int m = n * 2;

	randomNumberUtil::init_random_seeds(rs);
//--------------------- run experiment - one can change precission here

//	string inputFile, int file, int totalFiles,
//			ProblemData<L, D> & part, bool zeroBased

	ProblemData<int, double> part;
	std::vector<double> A;
	std::vector<double> b;

	if (RDN == 0) {
		loadDistributedSparseSVMRowData(file, -1, -1, part, false);
		m = part.m;
		n = part.n;
//		std::vector<double> &A = part.A_csr_values;
//		std::vector<double> &b = part.b;
	} else {

		switch (RDN) {
		case 1:

			n = 2048;
			m = n / 2;

			break;

		case 2:

			n = 2048;
			m = n * 2;

			break;

		case 3:

			n = 2048;
			m = n;

			break;

		default:
			break;
		}

		A.resize(m * n, 0);
		for (int i = 0; i < n; i++) {
			for (int j = 0; j < m; j++) {
				A[i * m + j] = -1 + 2 * rand() / (0.0 + RAND_MAX);
			}
			double norm = cblas_l2_norm(m, &A[m * i], 1);
			cblas_vector_scale(m, &A[m * i], 1 / norm);
		}
		b.resize(n);
		for (int i = 0; i < b.size(); i++) {
			b[i] = -1 + 2 * round(rand() / (0.0 + RAND_MAX));
		}
	}
	std::vector<double> Li(n, 0);
	std::vector<double> LiSqInv(n, 0);

	bool dense = (m * n == A.size());

	if (dense) {
		cout << "Input data is dense!!!" << endl;
		for (int i = 0; i < n; i++) {
			Li[i] = 0;
			LiSqInv[i] = 0;
			for (int j = 0; j < m; j++) {
				Li[i] += A[i * m + j] * A[i * m + j];
			}
			if (Li[i] > 0) {
				LiSqInv[i] = 1 / sqrt(Li[i]);
			}
		}

	} else {
		cout << "Input data is sparse!!!  " << part.A_csr_row_ptr.size()
				<< endl;

		for (int i = 0; i < n; i++) {
			Li[i] = 0;
			LiSqInv[i] = 0;
			for (int j = part.A_csr_row_ptr[i]; j < part.A_csr_row_ptr[i + 1];
					j++) {
				Li[i] += part.A_csr_values[j] * part.A_csr_values[j];
			}
			if (Li[i] > 0) {
				LiSqInv[i] = 1 / sqrt(Li[i]);
			}
		}

	}
	cout << "Stage 2" << endl;

	std::vector<double> x(n);
	std::vector<double> y(m);

	for (int i = 0; i < n; i++) {
		x[i] = rand() / (0.0 + RAND_MAX);
	}
	double norm = cblas_l2_norm(n, &x[0], 1);
	cblas_vector_scale(n, &x[0], 1 / norm);

	double maxEig = 1;

	for (int PM = 0; PM < 20; PM++) {

		if (dense) {
			for (int j = 0; j < m; j++) {
				y[j] = 0;

				for (int i = 0; i < n; i++) {
					y[j] += A[i * m + j] * LiSqInv[i] * x[i];
				}
			}

		} else {
			for (int j = 0; j < m; j++) {
				y[j] = 0;
			}
			for (int i = 0; i < n; i++) {
				for (int j = part.A_csr_row_ptr[i];
						j < part.A_csr_row_ptr[i + 1]; j++) {
					y[part.A_csr_col_idx[j]] += part.A_csr_values[j]
							* LiSqInv[i] * x[i];
				}

			}

		}

		for (int i = 0; i < n; i++) {
			x[i] = 0;
			if (dense) {
				for (int j = 0; j < m; j++) {
					x[i] += A[i * m + j] * LiSqInv[i] * y[j];
				}
			} else {
				for (int j = part.A_csr_row_ptr[i];
						j < part.A_csr_row_ptr[i + 1]; j++) {
					x[i] += part.A_csr_values[j] * LiSqInv[i]
							* y[part.A_csr_col_idx[j]];
				}

			}

		}

		maxEig = cblas_l2_norm(n, &x[0], 1);
		cout << maxEig << endl;
		cblas_vector_scale(n, &x[0], 1 / maxEig);

	}

	cout << "Max eigenvalue estimated " << endl;

	x.resize(0);
	y.resize(0);
	LiSqInv.resize(0);

	double lambda = 1 / (n + 0.0);

	double maxTime = 1000;

	std::vector<double> Hessian;
	if (n < 10000) {
		Hessian.resize(n * n);

		if (dense) {

			cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, n, n, m, 1.0,
					&A[0], m, &A[0], m, 0, &Hessian[0], n);

		} else {

			std::vector<double>& vals = part.A_csr_values;
			std::vector<int> &rowPtr = part.A_csr_row_ptr;
			std::vector<int> &colIdx = part.A_csr_col_idx;

			for (int row = 0; row < n; row++) {
				for (int col = row; col < n; col++) {

					double tmp = 0;

					int id1 = rowPtr[row];
					int id2 = rowPtr[col];

					while (id1 < rowPtr[row + 1] && id2 < rowPtr[col + 1]) {

						if (colIdx[id1] == colIdx[id2]) {
							tmp += vals[id1] * vals[id2];
							id1++;
							id2++;
						} else if (colIdx[id1] < colIdx[id2]) {
							id1++;
						} else {
							id2++;
						}

					}

					Hessian[row * n + col] = tmp;
					Hessian[col * n + row] = tmp;

				}
			}
		}
	}

	int MAXTAU = n / 2;
	if (MAXTAU > 1024 * 8) {
		MAXTAU = 1024 * 8;
	}

	for (int tau = 1; tau <= MAXTAU; tau = tau * 2) {
//		int tau = 1;
//		omp_set_num_threads(tau);

		double sigma = 1 + (tau - 1) * (maxEig - 1) / (n - 1.0);

		if (dense) {
			randomNumberUtil::init_random_seeds(rs);

			runPCDMExperiment(m, n, A, b, lambda, tau, logFile, Li, sigma,
					maxTime);
			randomNumberUtil::init_random_seeds(rs);

			runCDNExperiment(m, n, A, b, lambda, tau, logFile, 1, maxTime,
					Hessian);
			randomNumberUtil::init_random_seeds(rs);

			runCDNExperiment(m, n, A, b, lambda, tau, logFile, 2, maxTime,
					Hessian);
		} else {

			randomNumberUtil::init_random_seeds(rs);
			runPCDMExperimentSparse(m, n, part, part.b, lambda, tau, logFile,
					Li, sigma, maxTime);
			randomNumberUtil::init_random_seeds(rs);
			runCDNExperimentSparse(m, n, part, part.b, lambda, tau, logFile, 1,
					maxTime, Hessian);
			randomNumberUtil::init_random_seeds(rs);
			runCDNExperimentSparse(m, n, part, part.b, lambda, tau, logFile, 2,
					maxTime, Hessian);

		}
	}

	logFile.close();

//	histogramLogFile.close();
//	experimentLogFile.close();
	return 0;
}
Beispiel #11
0
// This MList function creates a list of the M values giving the probabilities of mutations
static PyObject *MList(PyObject *self, PyObject *args) {
    // Calling variables are (in order): uts, only_need, n_aa, length, grs, mlist, residue_to_compute
    PyObject *uts, *only_need, *grs, *r_grs_tuple, *gr_diag, *p, *p_inv, *mlist;
    long n_aa, length, n_aa2, n_uts, residue_to_compute, i_ut, y, index, irowcolumn, only_need_index, only_need_i, only_need_i_n_aa;
    double *arr_mlist, *cp_inv, *cp, *cgr_diag, *cp_inv_exp;
    complex double *complex_cp_inv, *complex_cp, *complex_cgr_diag, *complex_cp_inv_exp, *complex_naa2_list, *complex_naa_list;
    double ut, exp_ut;
    complex double complex_exp_ut;
    int array_type;
#ifdef USE_ACCELERATE_CBLAS
    complex double complex_one = 1, complex_zero = 0;
    long index2;
#else
    complex double complex_mxy;
    double mxy;
    long x, yindex;
#endif
    // Parse the arguments.  
    if (! PyArg_ParseTuple( args, "O!O!llO!O!l", &PyList_Type, &uts, &PyList_Type, &only_need, &n_aa, &length, &PyList_Type, &grs, &PyArray_Type, &mlist, &residue_to_compute)) {
        PyErr_SetString(PyExc_TypeError, "Invalid calling arguments to MList.");
        return NULL;
    }
    // Error checking on arguments
    if (length < 1) { // length of the protein
        PyErr_SetString(PyExc_ValueError, "length is less than one.");
        return NULL;
    }
    if (n_aa < 1) { // number of amino acids.  Normally will be 20.
        PyErr_SetString(PyExc_ValueError, "n_aa is less than one.");
        return NULL;
    }
    if (PyList_GET_SIZE(grs) != length) { // make sure grs is of the same size as length
        // PyErr_SetString(PyExc_ValueError, "grs is not of the same size as length.");
        char errstring[200];
        sprintf(errstring, "grs is not of the same size as length: %ld, %ld", PyList_GET_SIZE(grs), length);
        PyErr_SetString(PyExc_ValueError, errstring);
        return NULL;
    }
    n_uts = PyList_GET_SIZE(uts); // number of entries in uts
    if (n_uts < 1) { // make sure there are entries in uts 
        PyErr_SetString(PyExc_ValueError, "uts has no entries.");
        return NULL;
    }
    if (PyList_GET_SIZE(only_need) != n_uts * length) { // make sure only_need is of correct size
        PyErr_SetString(PyExc_ValueError, "only_need is of wrong size.");
        return NULL;
    }
    if (! ((residue_to_compute >= 0) && (residue_to_compute < length))) {
        char errstring[200];
        sprintf(errstring, "Invalid value for residue_to_compute: %ld, %ld", residue_to_compute, length);
        PyErr_SetString(PyExc_ValueError, errstring);
        return NULL;
    }
    n_aa2 = n_aa * n_aa; // square of the number of amino acids
    // The results will be returned in a numpy ndarray 'float_' (C type double) array called mlist.
    // This array will be of size length * n_uts * n_aa2.
    arr_mlist = (double *) PyArray_DATA(mlist); // this is the data array of mlist
    long const sizeof_cp_inv = n_aa2 * sizeof(double);
    long const complex_sizeof_cp_inv = n_aa2 * sizeof(complex double);
    // Now begin filling arr_mlist with the appropriate values
    index = 0;
    only_need_index = residue_to_compute * n_uts;
    r_grs_tuple = PyList_GET_ITEM(grs, residue_to_compute);
    // gr_diag, p, and p_inv are the eigenvalues, left, and right diagonalizing matrices of gr
    gr_diag = PyTuple_GET_ITEM(r_grs_tuple, 0); 
    p = PyTuple_GET_ITEM(r_grs_tuple, 1);
    p_inv = PyTuple_GET_ITEM(r_grs_tuple, 2);
    // determine if these arrays are complex double or real doubles
    array_type = PyArray_TYPE(gr_diag);
    if (array_type == NPY_DOUBLE) { // array is of doubles, real not complex
        cp_inv_exp = (double *) malloc(sizeof_cp_inv); // allocate memory
        // Note that these next assignments assume that the arrays are C-style contiguous
        cp = PyArray_DATA(p);
        cp_inv = PyArray_DATA(p_inv);
        cgr_diag = PyArray_DATA(gr_diag);
        for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values
            only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need
            only_need_index++;
            if (only_need_i == -1) { // we don't need to do anything for these entries in mlist
                index += n_aa2;
            } else { // we need to compute at least some entries in mlist
                ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut));  // ut value
                for (irowcolumn = 0; irowcolumn < n_aa; irowcolumn++) {
                    exp_ut = exp(ut * cgr_diag[irowcolumn]);
                    for (y = irowcolumn * n_aa; y < (irowcolumn + 1) * n_aa; y++) {
                        cp_inv_exp[y] = cp_inv[y] * exp_ut;
                    }
                }
                if (only_need_i == -2) { // we need to compute all of these entries in mlist
#ifdef USE_ACCELERATE_CBLAS
                    // multiply the matrices using cblas_dgemm, and fill mlist with the results
                    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cp, n_aa, cp_inv_exp, n_aa, (double) 0.0, &arr_mlist[index], n_aa);
                    for (index2 = index; index2 < index + n_aa2; index2++) {
                        arr_mlist[index2] = fabs(arr_mlist[index2]);
                    }
                    index += n_aa2;
#else
                    // multiply the matrices in pure C code, and fill mlist with the results
                    for (x = 0; x < n_aa2; x += n_aa) {
                        for (y = 0; y < n_aa; y++) {
                            mxy = 0.0;
                            yindex = y;
                            for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) {
                                mxy += cp[irowcolumn] * cp_inv_exp[yindex];
                                yindex += n_aa;
                            }
                            arr_mlist[index++] = fabs(mxy);
                        }
                    }
#endif
                } else { // we need to compute entries in mlist only for x = only_need_i
                    only_need_i_n_aa = only_need_i * n_aa;
                    index += only_need_i_n_aa;
#ifdef USE_ACCELERATE_CBLAS
                    // do the matrix vector multiplication using cblas, and put results in mlist
                    cblas_dgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, (double) 1.0, cp_inv_exp, n_aa, &cp[only_need_i_n_aa], 1, (double) 0.0, &arr_mlist[index], 1); 
                    for (index2 = index; index2 < index + n_aa2 - only_need_i_n_aa; index2++) {
                        arr_mlist[index2] = fabs(arr_mlist[index2]);
                    }
                    index += n_aa2 - only_need_i_n_aa;
#else
                    // do the matrix vector multiplication in pure C code, and put results in mlist
                    for (y = 0; y < n_aa; y++) {
                        mxy = 0.0;
                        yindex = y;
                        for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) {
                            mxy += cp[irowcolumn] * cp_inv_exp[yindex];
                            yindex += n_aa;
                        }
                        arr_mlist[index++] = fabs(mxy);
                    }
                    index += n_aa2 - only_need_i_n_aa - n_aa;
#endif
                }
            }
        }
        free(cp_inv_exp);
    } else if (array_type == NPY_CDOUBLE) { // array is complex doubles
        complex_cp_inv_exp = (complex double *) malloc(complex_sizeof_cp_inv); // allocate memory
        complex_naa2_list = (complex double *) malloc(n_aa2 * sizeof(complex double)); // allocate memory
        complex_naa_list = (complex double *) malloc(n_aa * sizeof(complex double)); // allocate memory
        // Note that these next assignments assume that the arrays are C-style contiguous
        complex_cp = PyArray_DATA(p);
        complex_cp_inv = PyArray_DATA(p_inv);
        complex_cgr_diag = PyArray_DATA(gr_diag);
        for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values
            only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need
            only_need_index++;
            if (only_need_i == -1) { // we don't need to do anything for these entries in mlist
                index += n_aa2;
            } else { // we need to compute at least some entries in mlist
                ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut));  // ut value
                for (irowcolumn = 0; irowcolumn < n_aa; irowcolumn++) {
                    complex_exp_ut = cexp(ut * complex_cgr_diag[irowcolumn]);
                    for (y = irowcolumn * n_aa; y < (irowcolumn + 1) * n_aa; y++) {
                        complex_cp_inv_exp[y] = complex_cp_inv[y] * complex_exp_ut;
                    }
                }
                if (only_need_i == -2) { // we need to compute all of these entries in mlist
#ifdef USE_ACCELERATE_CBLAS
                    // multiply the matrices using cblas_zgemm, and fill mlist with the results
                    cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cp, n_aa, complex_cp_inv_exp, n_aa, &complex_zero, complex_naa2_list, n_aa); 
                    for (index2 = 0; index2 < n_aa2; index2++) {
                        arr_mlist[index + index2] = cabs(complex_naa2_list[index2]);
                    }
                    index += n_aa2;
#else
                    // multiply the matrices in pure C code, and fill mlist with the results
                    for (x = 0; x < n_aa2; x += n_aa) {
                        for (y = 0; y < n_aa; y++) {
                            complex_mxy = (complex double) 0.0;
                            yindex = y;
                            for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) {
                                complex_mxy += complex_cp[irowcolumn] * complex_cp_inv_exp[yindex];
                                yindex += n_aa;
                            }
                            arr_mlist[index++] = cabs(complex_mxy);
                        }
                    }
#endif
                } else { // we need to compute entries in mlist only for x = only_need_i
                    only_need_i_n_aa = only_need_i * n_aa;
                    index += only_need_i_n_aa;
#ifdef USE_ACCELERATE_CBLAS
                    // do the matrix vector multiplication using cblas, and put results in mlist
                    cblas_zgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, &complex_one, complex_cp_inv_exp, n_aa, &complex_cp[only_need_i_n_aa], 1, &complex_zero, complex_naa_list, 1);  
                    for (index2 = 0; index2 < n_aa; index2++) {
                        arr_mlist[index + index2] = cabs(complex_naa_list[index2]);
                    }
                    index += n_aa2 - only_need_i_n_aa;
#else
                    // do the matrix vector multiplication in pure C code, and put results in mlist
                    for (y = 0; y < n_aa; y++) {
                        complex_mxy = (complex double) 0.0;
                        yindex = y;
                        for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) {
                            complex_mxy += complex_cp[irowcolumn] * complex_cp_inv_exp[yindex];
                            yindex += n_aa;
                        }
                        arr_mlist[index++] = cabs(complex_mxy);
                    }
                    index += n_aa2 - only_need_i_n_aa - n_aa;
#endif
                }
            }
        }
        free(complex_cp_inv_exp);
        free(complex_naa2_list);
        free(complex_naa_list);
    } else { // array is of neither real nor complex doubles
        PyErr_SetString(PyExc_ValueError, "gr entries are neither double nor complex doubles.");
        return NULL;
    }
    return PyInt_FromLong((long) 1);
}
Beispiel #12
0
static inline int
CORE_dpamm_a2(PLASMA_enum side, PLASMA_enum trans, PLASMA_enum uplo,
              int M, int N, int K, int L,
              int vi2, int vi3,
                    double *A2, int LDA2,
              const double *V, int LDV,
                    double *W, int LDW)
{

   /*
    *  A2 = A2 + op(V) * W  or  A2 = A2 + W * op(V)
    */

    int j;
    static double zone  =  1.0;
    static double mzone  =  -1.0;

    if (side == PlasmaLeft) {

        if (((trans == PlasmaTrans) && (uplo == CblasUpper)) ||
            ((trans == PlasmaNoTrans) && (uplo == CblasLower))) {

            printf("Left Upper/ConjTrans & Lower/NoTrans not implemented yet\n");
            return PLASMA_ERR_NOT_SUPPORTED;

        }
        else {  //trans

            /*
             * A2 = A2 - V * W
             */

            /* A2_1 = A2_1 - V_1  * W_1 */
            if (M > L) {
                cblas_dgemm(
                    CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans,
                    M-L, N, L,
                    (mzone), V, LDV,
                    W, LDW,
                    (zone), A2, LDA2);
            }

            /* W_1 = V_2 * W_1 */
            cblas_dtrmm(
                CblasColMajor, CblasLeft, (CBLAS_UPLO)uplo,
                (CBLAS_TRANSPOSE)trans, CblasNonUnit, L, N,
                (zone), &V[vi2], LDV,
                W, LDW);

            /* A2_2 = A2_2 - W_1 */
            for(j = 0; j < N; j++) {
                cblas_daxpy(
                    L, (mzone),
                    &W[LDW*j], 1,
                    &A2[LDA2*j+(M-L)], 1);
            }

            /* A2 = A2 - V_3  * W_2 */
            if (K > L) {
                cblas_dgemm(
                    CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans,
                    M, N, (K-L),
                    (mzone), &V[vi3], LDV,
                    &W[L], LDW,
                    (zone), A2, LDA2);
            }

        }
    }
    else { //side right

        if (((trans == PlasmaTrans) && (uplo == CblasUpper)) ||
            ((trans == PlasmaNoTrans) && (uplo == CblasLower))) {

            /*
             * A2 = A2 - W * V'
             */

            /* A2 = A2 - W_2 * V_3' */
            if (K > L) {
                cblas_dgemm(
                    CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                    M, N, K-L,
                    (mzone), &W[LDW*L], LDW,
                    &V[vi3], LDV,
                    (zone), A2, LDA2);
            }

            /* A2_1 = A2_1 - W_1 * V_1' */
            if (N > L) {
                cblas_dgemm(
                    CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                    M, N-L, L,
                    (mzone), W, LDW,
                    V, LDV,
                    (zone), A2, LDA2);
            }

            /* A2_2 =  A2_2 -  W_1 * V_2' */
            if (L > 0) {
                cblas_dtrmm(
                    CblasColMajor, CblasRight, (CBLAS_UPLO)uplo,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M, L,
                    (mzone), &V[vi2], LDV,
                    W, LDW);

                for (j = 0; j < L; j++) {
                    cblas_daxpy(
                        M, (zone),
                        &W[LDW*j], 1,
                        &A2[LDA2*(N-L+j)], 1);
                }
            }

        }
        else {
            printf("Right Upper/NoTrans & Lower/ConjTrans not implemented yet\n");
            return PLASMA_ERR_NOT_SUPPORTED;
        }
    }

    return PLASMA_SUCCESS;
}
Beispiel #13
0
static inline int
CORE_dpamm_w(PLASMA_enum side, PLASMA_enum trans, PLASMA_enum uplo,
             int M, int N, int K, int L,
             int vi2, int vi3,
             const double *A1, int LDA1,
                   double *A2, int LDA2,
             const double *V, int LDV,
                   double *W, int LDW)
{

   /*
    * W = A1 + op(V) * A2  or  W = A1 + A2 * op(V)
    */

    int j;
    static double zone  =  1.0;
    static double zzero =  0.0;

    if (side == PlasmaLeft) {

        if (((trans == PlasmaTrans) && (uplo == CblasUpper)) ||
            ((trans == PlasmaNoTrans) && (uplo == CblasLower))) {

            /*
             * W = A1 + V' * A2
             */

            /* W = A2_2 */
            LAPACKE_dlacpy_work(LAPACK_COL_MAJOR,
                lapack_const(PlasmaUpperLower),
                L, N,
                &A2[K-L], LDA2, W, LDW);

            /* W = V_2' * W + V_1' * A2_1 (ge+tr, top L rows of V') */
            if (L > 0) {
                /* W = V_2' * W */
                cblas_dtrmm(
                    CblasColMajor, CblasLeft, (CBLAS_UPLO)uplo,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, L, N,
                    (zone), &V[vi2], LDV,
                    W, LDW);

                /* W = W + V_1' * A2_1 */
                if (K > L) {
                    cblas_dgemm(
                        CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans,
                        L, N, K-L,
                        (zone), V, LDV,
                        A2, LDA2,
                        (zone), W, LDW);
                }
            }

            /* W_2 = V_3' * A2: (ge, bottom M-L rows of V') */
            if (M > L) {
                cblas_dgemm(
                    CblasColMajor, (CBLAS_TRANSPOSE)trans, CblasNoTrans,
                    (M-L), N, K,
                    (zone), &V[vi3], LDV,
                    A2, LDA2,
                    (zzero), &W[L], LDW);
            }

            /* W = A1 + W */
            for(j = 0; j < N; j++) {
                cblas_daxpy(
                        M, (zone),
                        &A1[LDA1*j], 1,
                        &W[LDW*j], 1);
            }
        }
        else {
            printf("Left Upper/NoTrans & Lower/ConjTrans not implemented yet\n");
            return PLASMA_ERR_NOT_SUPPORTED;

        }
    }
    else { //side right

        if (((trans == PlasmaTrans) && (uplo == CblasUpper)) ||
            ((trans == PlasmaNoTrans) && (uplo == CblasLower))) {
            printf("Right Upper/ConjTrans & Lower/NoTrans not implemented yet\n");
            return PLASMA_ERR_NOT_SUPPORTED;

        }
        else {

            /*
             * W = A1 + A2 * V
             */

            if (L > 0) {

                /* W = A2_2 */
                LAPACKE_dlacpy_work(LAPACK_COL_MAJOR,
                    lapack_const(PlasmaUpperLower),
                    M, L,
                    &A2[LDA2*(K-L)], LDA2, W, LDW);

                /* W = W * V_2 --> W = A2_2 * V_2 */
                cblas_dtrmm(
                    CblasColMajor, CblasRight, (CBLAS_UPLO)uplo,
                    (CBLAS_TRANSPOSE)trans, CblasNonUnit, M, L,
                    (zone), &V[vi2], LDV,
                    W, LDW);

                /* W = W + A2_1 * V_1 */
                if (K > L) {
                    cblas_dgemm(
                        CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                        M, L, K-L,
                        (zone), A2, LDA2,
                        V, LDV,
                        (zone), W, LDW);
                }

            }

            /* W = W + A2 * V_3 */
            if (N > L) {
                cblas_dgemm(
                    CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                    M, N-L, K,
                    (zone), A2, LDA2,
                    &V[vi3], LDV,
                    (zzero), &W[LDW*L], LDW);
            }

            /* W = A1 + W */
            for (j = 0; j < N; j++) {
                cblas_daxpy(
                        M, (zone),
                        &A1[LDA1*j], 1,
                        &W[LDW*j], 1);
            }
        }
    }

    return PLASMA_SUCCESS;
}
/* Square matrix-matrix multiplication */
void matrix_multiply(int M, int N, int K, 
		int blockX_len, int blockY_len) 
{
	/* Local buffers and Global arrays declaration */
	double *a=NULL, *b=NULL, *c=NULL;

	int dims[NDIMS], ld[NDIMS], chunks[NDIMS];
	int lo[NDIMS], hi[NDIMS], cdims[NDIMS]; /* dim of blocks */

	int g_a, g_b, g_c, g_cnt, g_cnt2;
	int offset;
	double alpha = 1.0, beta=0.0;
	int count_p = 0, next_p = 0;
	int count_gac = 0, next_gac = 0;
	double t1,t2,seconds;
        ga_nbhdl_t nbh;
        int count_acc = 0;

	/* Find local processor ID and the number of processes */
	int proc=GA_Nodeid(), nprocs=GA_Nnodes();

	if ((M % blockX_len) != 0 || (M % blockY_len) != 0 || (N % blockX_len) != 0 || (N % blockY_len) != 0 
			|| (K % blockX_len) != 0 || (K % blockY_len) != 0)
		GA_Error("Dimension size M/N/K is not divisible by X/Y block sizes", 101);

	/* Allocate/Set process local buffers */
	a = malloc (blockX_len * blockY_len * sizeof(double)); 
	b = malloc (blockX_len * blockY_len * sizeof(double)); 
	c = malloc (blockX_len * blockY_len * sizeof(double));

	cdims[0] = blockX_len;
	cdims[1] = blockY_len;	

	/* Configure array dimensions */
	for(int i = 0; i < NDIMS; i++) {
		dims[i]  = N;
		chunks[i] = -1;
		ld[i]    = cdims[i]; /* leading dimension/stride of the local buffer */
	}

	/* create a global array g_a and duplicate it to get g_b and g_c*/
	g_a = NGA_Create(C_DBL, NDIMS, dims, "array A", chunks);

	if (!g_a) 
		GA_Error("NGA_Create failed: A", NDIMS);

#if DEBUG>1
	if (proc == 0) 
		printf("  Created Array A\n");
#endif
	/* Ditto for C and B */
	g_b = GA_Duplicate(g_a, "array B");
	g_c = GA_Duplicate(g_a, "array C");

	if (!g_b || !g_c) 
		GA_Error("GA_Duplicate failed",NDIMS);
	if (proc == 0) 
		printf("Created Arrays B and C\n");

	/* Subscript array for read-incr, which is nothing but proc */
	int * rdcnt = malloc (nprocs * sizeof(int));
	memset (rdcnt, 0, nprocs * sizeof(int));
	int * rdcnt2 = malloc (nprocs * sizeof(int));
	memset (rdcnt2, 0, nprocs * sizeof(int));

	/* Create global array of nprocs elements for nxtval */	
	int counter_dim[1];
	counter_dim[0] = nprocs;

	g_cnt = NGA_Create(C_INT, 1, counter_dim, "Shared counter", NULL);

	if (!g_cnt) 
		GA_Error("Shared counter failed",1);

	g_cnt2 = GA_Duplicate(g_cnt, "another shared counter");

	if (!g_cnt2) 
		GA_Error("Another shared counter failed",1);

	GA_Zero(g_cnt);
	GA_Zero(g_cnt2);

#if DEBUG>1	
	/* initialize data in matrices a and b */
	if(proc == 0)
		printf("Initializing local buffers - a and b\n");
#endif
	int w = 0; 
	int l = 7;
	for(int i = 0; i < cdims[0]; i++) {
		for(int j = 0; j < cdims[1]; j++) {
			a[i*cdims[1] + j] = (double)(++w%29);
			b[i*cdims[1] + j] = (double)(++l%37);
		}
	}

	/* Copy data to global arrays g_a and g_b from local buffers */
	next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
	for (int i = 0; i < N; i+=cdims[0]) 
	{
		if (next_p == count_p) {
			for (int j = 0; j < N; j+=cdims[1])
			{
				/* Indices of patch */
				lo[0] = i;
				lo[1] = j;
				hi[0] = lo[0] + cdims[0];
				hi[1] = lo[1] + cdims[1];

				hi[0] = hi[0]-1;
				hi[1] = hi[1]-1;
#if DEBUG>1
				printf ("%d: PUT_GA_A_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Put(g_a, lo, hi, a, ld);
				NGA_Put(g_b, lo, hi, b, ld);

			}
			next_p = NGA_Read_inc(g_cnt2,&rdcnt[proc],(long)1);
		}		
		count_p++;
	}


#if DEBUG>1
	printf ("After NGA_PUT to global - A and B arrays\n");
#endif
	/* Synchronize all processors to make sure puts from 
	   nprocs has finished before proceeding with dgemm */
	GA_Sync();

	t1 = GA_Wtime();

	next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
	for (int m = 0; m < N; m+=cdims[0])
	{
		for (int k = 0; k < N; k+=cdims[0])
		{
			if (next_gac == count_gac)	
			{
				/* A = m x k */
				lo[0] = m; lo[1] = k;
				hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];

				hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
				printf ("%d: GET GA_A: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
				NGA_Get(g_a, lo, hi, a, ld);

				for (int n = 0; n < N; n+=cdims[1])
				{
					memset (c, 0, sizeof(double) * cdims[0] * cdims[1]);
					/* B = k x n */
					lo[0] = k; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: GET_GA_B: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_Get(g_b, lo, hi, b, ld);


					//_my_dgemm_ (a, local_N, b, local_N, c, local_N, local_N, local_N, local_N, alpha, beta=1.0);

					/* TODO I am assuming square matrix blocks, further testing/work 
					   required for rectangular matrices */
					cblas_dgemm ( CblasRowMajor, CblasNoTrans, /* TransA */CblasNoTrans, /* TransB */
							cdims[0] /* M */, cdims[1] /* N */, cdims[0] /* K */, alpha,
							a, cdims[0], /* lda */ b, cdims[1], /* ldb */
							beta=1.0, c, cdims[0] /* ldc */);

					NGA_NbWait(&nbh);

					/* C = m x n */
					lo[0] = m; lo[1] = n;
					hi[0] = cdims[0] + lo[0]; hi[1] = cdims[1] + lo[1];				

					hi[0] = hi[0]-1; hi[1] = hi[1]-1;
#if DEBUG>3
					printf ("%d: ACC_GA_C: lo[0,1] = %d,%d and hi[0,1] = %d,%d\n",proc,lo[0],lo[1],hi[0],hi[1]);
#endif
					NGA_NbAcc(g_c, lo, hi, c, ld, &alpha, &nbh);
					count_acc += 1;
				} /* END LOOP N */
				next_gac = NGA_Read_inc(g_cnt,&rdcnt2[proc],(long)1);
			} /* ENDIF if count == next */
			count_gac++;
		} /* END LOOP K */
	} /* END LOOP M */

	GA_Sync();
	t2 = GA_Wtime();
	seconds = t2 - t1;
	if (proc == 0)
		printf("Time taken for MM (secs):%lf \n", seconds);

        printf("Number of ACC: %d\n", count_acc);

	/* Correctness test - modify data again before this function */
	for (int i = 0; i < NDIMS; i++) {
		lo[i] = 0;
		hi[i] = dims[i]-1;
		ld[i] = dims[i];
	}

	verify(g_a, g_b, g_c, lo, hi, ld, N);

	/* Clear local buffers */
	free(a);
	free(b);
	free(c);
	free(rdcnt);
	free(rdcnt2);

	GA_Sync();

	/* Deallocate arrays */
	GA_Destroy(g_a);
	GA_Destroy(g_b);
	GA_Destroy(g_c);
	GA_Destroy(g_cnt);
	GA_Destroy(g_cnt2);
}
Beispiel #15
0
int main(int argc, char* argv[])
{
	char dummy[L2_CACHE_SIZE];
	
// Tests de performances de ddot	
	int size = 50;

	blas_t *matriceD, *matriceE;
	alloc_vecteur(&matriceD, size);
	alloc_vecteur(&matriceE, size);

	printf("Tests de performance de la fonction ddot\n");
	perf_t *t1, *t2,*t3, *t4,*t5, *t6,*t7, *t8, *t9, *t10;
	t1 = malloc(sizeof(perf_t));
	t2 = malloc(sizeof(perf_t));
        t3 = malloc(sizeof(perf_t));
	t4 = malloc(sizeof(perf_t));
        t5 = malloc(sizeof(perf_t));
	t6 = malloc(sizeof(perf_t));
        t7 = malloc(sizeof(perf_t));
	t8 = malloc(sizeof(perf_t));
        t9 = malloc(sizeof(perf_t));
	t10 = malloc(sizeof(perf_t));
        

	double mflops, mflops1,mflops2,mflops3,mflops4, mflops5;
	char command[200];

        system("rm results/ddot_perf.txt");
	for(size = 50; size < 100000000; size += size/4)
	{
		printf("M: %d ", size);
		if(size != 50)
		{
			free(matriceD);
			free(matriceE);
			alloc_vecteur(&matriceD, size);
			alloc_vecteur(&matriceE, size);
		}
		memset(dummy, 0, sizeof(dummy));
		perf(t1);
		blas_t res = cblas_ddot(size, matriceD, 1, matriceE, 1);
		perf(t2);
		perf_diff(t1, t2);
		mflops = perf_mflops(t2, 2 * size);
                printf("Mflops/s: %le\n", mflops);

                sprintf(command, "echo %d %lf >> results/ddot_perf.txt", size, mflops);	
                system(command);
				
	}


// Test de performance dgemm
//////////////////////////////////////////

	long m = 100;

	
	blas_t *matriceA, *matriceB, *matriceC;
	
	alloc_matrice(&matriceA, m, m);
	alloc_matrice(&matriceB, m, m);
        matriceC = calloc(m*m,sizeof(blas_t));
        system("rm results/dgemm_perf.txt");

	for(; m< 1000; m+=20)
	{
            printf("M: %d ", m);
        
		if(m != 100)
		{
			free(matriceA);
			free(matriceB);
			free(matriceC);
			alloc_matrice(&matriceA, m, m);
			alloc_matrice(&matriceB, m, m);
			alloc_matrice(&matriceC, m, m);
		}
        
        memset(dummy, 0, sizeof(dummy));
		perf(t1);
        cblas_dgemm_scalaire( CblasNoTrans, CblasNoTrans ,m, m, m, 1, matriceA, m, matriceB, m, 1, matriceC, m);
		perf(t2);
		perf_diff(t1, t2);
                mflops1 = perf_mflops(t2, m * m * m * 3 + m * m );

                
        
		perf(t3);
		cblas_dgemm_scalaire1(matriceC, m, matriceA, m, matriceB, m,  m);
		perf(t4);
		perf_diff(t3, t4);
        
        
                
                mflops2 = perf_mflops(t4, m * m * m * 3);
              	perf(t5);
		cblas_dgemm_scalaire2(matriceC, m, matriceA, m, matriceB, m,  m);
		perf(t6);
		perf_diff(t5, t6);
                mflops3 = perf_mflops(t6, m * m * m * 3);
                perf(t7);
		cblas_dgemm_scalaire3(matriceC, m, matriceA, m, matriceB, m,  m);
		perf(t8);
		perf_diff(t7, t8);
                mflops4 = perf_mflops(t8, m * m * m * 3);
                
                perf(t9);
		cblas_dgemm(CblasColMajor, CblasTrans, CblasNoTrans, m, m,m, 1, matriceA, m, matriceB, m, 1, matriceC, m);
		perf(t10);
		perf_diff(t9, t10);
                mflops5 = perf_mflops(t10, m * m * m * 3);
                
                sprintf(command, "echo %d %lf %lf %lf %lf %lf >> results/dgemm_perf.txt", m * m, mflops1, mflops2, mflops3, mflops4, mflops5);	
                system(command);
                printf("Mflops/s : %d %lf %lf %lf %lf %lf\n", m * m, mflops1, mflops2, mflops3, mflops4, mflops5 );
	}


	free(matriceA);
	free(matriceB);
	free(matriceC);
	free(matriceD);
	free(matriceE);
	return EXIT_SUCCESS;
}
Beispiel #16
0
static inline void compute_gemm_blas3(CMDOptions * options,  double * C, double * A, double * B)
{
	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, options->m, options->n, options->k,
			1.0, A, options->k, B, options->n, 1.0, C, options->n);
}
Beispiel #17
0
int sscf (basis_set_t *basis, erd_t *erd_inp, double *H, double * S, double *S_sinv, int n, int n_ele, int maxit, 
	int diis_lim, double *D_old, 
	double *D_new, double *F) 
{
	double *int_buffer;
	double *tmp;
	double *tmp2;
	double *F_tt;
	double *F_t;
	double *D_t;
	double *delta_D;
	double err;
	int conv = 0;
	int iter = 0;
	double trace;
	double s;
	double c;
	double lambda;
	int i;
	int max_funcs;
	int max_buffer_dim;
	
	max_funcs =  2 * basis->max_momentum + 1;
	max_buffer_dim = max_funcs * max_funcs * max_funcs * max_funcs;
	
	int_buffer = (double *)malloc (max_buffer_dim * sizeof(double));
	tmp = (double *)malloc (n * n * sizeof(double));
	F_t = (double *)malloc (n * n * sizeof(double));
	D_t = (double *)malloc (n * n * sizeof(double));
	delta_D = (double *)malloc (n * n * sizeof(double));
	tmp2 = (double *)malloc (n * n * sizeof(double));
	F_tt = (double *)malloc (n * n * sizeof(double));
	memset (int_buffer, 0, max_buffer_dim * sizeof(double));
	memset (tmp, 0, n * n * sizeof(double));
	memset (F_t, 0, n * n * sizeof(double));
	memset (D_old, 0, n * n * sizeof(double));
	memset (D_new, 0, n * n * sizeof(double));
	memset (D_t, 0, n * n * sizeof(double));
	memset (delta_D, 0, n * n * sizeof(double));
	memset (tmp2, 0, n * n * sizeof(double));
	memset (F_tt, 0, n * n * sizeof(double));
	memcpy (F, H, n * n * sizeof(double));

	
	memcpy (F_t, H, n * n * sizeof(double));

	do {
		
#if ODA
		/*1. D <- Diagonalize(F_t) */
		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, S_sinv, n, F_t, n, 0.0, tmp, n);

		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, tmp, n, S_sinv, n, 0.0, F_tt, n);
	
		/*Compute D*/
		compute_D (n, n_ele, F_tt, D_new);
	
		/*Transform D*/
		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, S_sinv, n, D_new, n, 0.0, tmp, n);

		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasTrans, n, n, n,
			1.0, tmp, n, S_sinv, n, 0.0, D_new, n);
		
		/*2. conv = Check (D-D') */

		/*3. F = Fock (D)*/
		
		memcpy (F, H, n * n * sizeof(double));
		build_fock (basis, erd_inp, int_buffer, D_new, F);

		/* delta_D = D - D_t*/
		
		memset (delta_D, 0, n * n * sizeof(double));
		cblas_daxpy (n * n, -1.0, D_t, 1, delta_D, 1);
		cblas_daxpy (n * n, 1.0, D_new, 1, delta_D, 1);

		/* s = trace(F_t * delta_D) */
		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, F_t, n, delta_D, n, 0.0, tmp, n);

		
		s = compute_trace (tmp, n);
		
		/*tmp = F - F_t*/
		memset (tmp, 0, n * n * sizeof(double));
		cblas_daxpy (n * n, -1.0, F_t, 1, tmp, 1);
		cblas_daxpy (n * n, 1.0, F, 1, tmp, 1);

		/* c = trace (tmp * delta_D) */
		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, tmp, n, delta_D, n, 0.0, tmp2, n);

		
		c = compute_trace (tmp2, n);
		/* set lambda */
		
		if (c < -s/2.0) {
			lambda = 1.0;
		} else {
			lambda = -s / (2.0 * c);
		}
		memcpy (D_old, D_t, n * n * sizeof (double));
		memcpy (F_tt, F_t, n * n * sizeof (double));


		/* D_t = (1-lambda) * D_t + lambda * D */
		memset (tmp, 0, n * n * sizeof(double));
		cblas_daxpy (n * n, (1.0 - lambda), D_t, 1, tmp, 1);
		cblas_daxpy (n * n, lambda, D_new, 1, tmp, 1);
		memset (D_t, 0, n * n * sizeof(double));
		cblas_daxpy (n * n, 1.0, tmp, 1, D_t, 1);
		
		/* F_t = (1-lambda) * F_t + lambda * F */
		memset (tmp, 0, n * n * sizeof(double));
		cblas_daxpy (n * n, (1.0 - lambda), F_t, 1, tmp, 1);
		cblas_daxpy (n * n, lambda, F, 1, tmp, 1);
		memset (F_t, 0, n * n * sizeof(double));
		cblas_daxpy (n * n, 1.0, tmp, 1, F_t, 1);
		
		/* print energy at each iteration */
		err = fabs (calc_hf_ene (D_new, F, H, n) - calc_hf_ene (D_old, F_tt, H, n));
		/* fprintf (stderr, "\n iteration ene %d: %lf", iter, calc_hf_ene(D_new, F, H, n)); */
		/* fprintf (stderr, "\n iteration %d: %10.6e", iter, err); */
		/* fprintf (stderr, "\n lambda %d: %lf",iter, lambda); */

		fprintf (stdout, "\n %d, %lf, %10.6e", iter, calc_hf_ene(D_new, F, H, n), err);


	
		iter++;

#endif

#if NORM
		memcpy (D_old, D_new, n * n * sizeof(double));

		/*Build F*/
		memcpy (F, H, n * n * sizeof(double));
		build_fock (basis, erd_inp, int_buffer, D_new, F);

		/*Transform F*/
		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, S_sinv, n, F, n, 0.0, tmp, n);

		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, tmp, n, S_sinv, n, 0.0, F_t, n);
	
		/*Compute D*/
		compute_D (n, n_ele, F_t, D_new);
	
		/*Transform D*/
		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
			1.0, S_sinv, n, D_new, n, 0.0, tmp, n);

		cblas_dgemm (CblasColMajor, CblasNoTrans, CblasTrans, n, n, n,
			1.0, tmp, n, S_sinv, n, 0.0, D_new, n);
		
		iter++;

		/*Check energy convergence*/
		err = fabs (calc_hf_ene (D_new, F, H, n) - calc_hf_ene (D_old, F, H, n));
		/* fprintf (stderr, "\n iteration ene %d: %lf", iter, calc_hf_ene(D_new, F, H, n)); */
		/* fprintf (stderr, "\n iteration %d: %10.6e", iter, err); */
		fprintf (stdout, "\n %d, %lf, %10.6e", iter, calc_hf_ene(D_new, F, H, n), err);
		

#endif


	} while ((iter < maxit));

	fprintf (stderr, "\n Final Energy: %lf \n", calc_hf_ene (D_new, F, H, n));

	/* printmatCM ("Final D", D_new, n, n); */
	/* printmatCM ("Final F", F, n, n); */
	free (D_t);
	free (delta_D);
	free (tmp2);
	free (tmp);
	free (F_t);
	free (F_tt);
	free (int_buffer);
	return 0;
}	
/***********************************************************//**
    Perform a left-right dmrg sweep 

    \param[in]     z       - initial guess
    \param[in]     f       - specialized function to multiply core by matrix
    \param[in]     args    - arguments to f
    \param[in,out] phil    - left multipliers
    \param[in]     psir    - right multiplies
    \param[in]     epsilon - splitting tolerance
    \param[in]     opts    - approximation options

    \return na - a new approximation
***************************************************************/
struct FunctionTrain * 
dmrg_sweep_lr(struct FunctionTrain * z, 
              void (*f)(char,size_t,size_t, double *, struct Qmarray **, void *),
              void * args, struct QR ** phil, struct QR ** psir, double epsilon,
              struct MultiApproxOpts * opts)
{
    double * RL = NULL;
    size_t dim = z->dim;
    struct FunctionTrain * na = function_train_alloc(dim);
    struct OneApproxOpts * o = NULL;
    na->ranks[0] = 1;
    na->ranks[na->dim] = 1;
    
    if (phil[0] == NULL){
        struct Qmarray * temp0 = NULL;
        f('L',0,1,NULL,&temp0,args);
        /* printf("temp0 size(%zu,%zu) \n",temp0->nrows,temp0->ncols); */
        o = multi_approx_opts_get_aopts(opts,0);
        phil[0] = qr_reduced(temp0,1,o);
        qmarray_free(temp0); temp0 = NULL;
    }
    /* exit(1); */
    
    size_t nrows = phil[0]->mr;
    size_t nmult = phil[0]->mc;
    size_t ncols = psir[0]->mc;

    RL = calloc_double(nrows * ncols);
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols,
                nmult,1.0,phil[0]->mat,nrows,psir[0]->mat,nmult,0.0,RL,nrows);

    double * u = NULL;
    double * vt = NULL;
    double * s = NULL;
    /* printf("Left-Right sweep\n"); */
    /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */
    size_t rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon);
    /* printf("rank=%zu\n",rank); */
    na->ranks[1] = rank;
    na->cores[0] = qmam(phil[0]->Q,u,rank);
    

    size_t ii;
    for (ii = 1; ii < dim-1; ii++){
        /* printf("ii = %zu\n",ii); */
        double * newphi = calloc_double(rank * phil[ii-1]->mc);
        cblas_dgemm(CblasColMajor,CblasTrans,CblasNoTrans,rank,nmult,
                    nrows,1.0,u,nrows,phil[ii-1]->mat,nrows,0.0,newphi,rank);
        //struct Qmarray * temp = mqma(newphi,y->cores[ii],rank);
        //struct Qmarray * temp = qmarray_mat_kron(rank,newphi,a->cores[ii],b->cores[ii]);
        struct Qmarray * temp = NULL;
        f('L',ii,rank,newphi,&temp,args);

        qr_free(phil[ii]); phil[ii] = NULL;
        o = multi_approx_opts_get_aopts(opts,ii);
        phil[ii] = qr_reduced(temp,1,o);
        
        free(RL); RL = NULL;
        free(newphi); newphi = NULL;
        free(u); u = NULL;
        free(vt); vt = NULL;
        free(s); s = NULL;
        qmarray_free(temp); temp = NULL;

        nrows = phil[ii]->mr;
        nmult = phil[ii]->mc;
        ncols = psir[ii]->mc;

        RL = calloc_double(nrows * ncols);
        cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols,
                    nmult,1.0,phil[ii]->mat,nrows,psir[ii]->mat,nmult,0.0,RL,nrows);

        /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */
        rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon);
        /* dprint(nrows,s); */
        /* printf("rank=%zu\n",rank); */
        na->ranks[ii+1] = rank;
        na->cores[ii] = qmam(phil[ii]->Q,u,rank);
    }
    
    /* exit(1); */
    size_t kk,jj;
    for (kk = 0; kk < ncols; kk++){
        for (jj = 0; jj < rank; jj++){
            vt[kk*rank+jj] = vt[kk*rank+jj]*s[jj];
        }
    }
    
    
    na->cores[dim-1] = mqma(vt,psir[dim-2]->Q,rank);

    free(RL); RL = NULL;
    free(u); u = NULL;
    free(vt); vt = NULL;
    free(s); s = NULL;

    return na;
}
Beispiel #19
0
int pdgemm(MPI_Comm comm, int p, int q, int bm, int bn, int bk, int gm, int gn, int gk, double *a, double *b, double *c) {

  int rank, size;
  
  MPI_Comm_rank (comm, &rank);
  MPI_Comm_size (comm, &size);

  MPI_Comm comm_row, comm_col;
  CreateGemmCommGroups(p, q, comm, &comm_row, &comm_col);

  int col_rank;
  MPI_Comm_rank (comm_col, &col_rank);
  
  int
    pm = bm / p,
    pn = bn / q,
    pk = bk / q;

  int
    m = gm / bm,
    n = gn / bn,
    k = gk / bk;


  int bi, bj, bl;

  double *b_buffer = malloc(sizeof(double) * pk * pn * n);
  if (b_buffer == NULL) {
    fprintf(stderr, "Failed to malloc memory for arrays\n");
    exit(1);
  }
  
  int i, j, l;
  for ( l=0; l<p*k; l++ ) {

    for (i=0; i<n; i++ ) {
      int buffer_offset = (i*pm*pn);

      int col_J = i*q + rank/p;
      int row_I = (col_J + l) % (p*k);

      int bcast_origin_rank = row_I % p;

      if (col_rank == bcast_origin_rank) {
	bl = row_I / p;
	bj = col_J / q;

	for ( j=0; j<pk*pn; j++ )
	  b_buffer[buffer_offset + j] = b[(bl + bj * k) * (pk * pn) + j];

      }
      //printf("(%d %d) [%d %d] %d ", row_I, col_J, bl, bj, bcast_origin_rank);
      MPI_Bcast(b_buffer + buffer_offset, pk*pn, MPI_DOUBLE, bcast_origin_rank, comm_col);

    }

    /*
      printf("Rank %d ", rank);
      for ( i=0; i< pk*pn*n; i++ ) {
      printf("%2.2f ", b_buffer[i]);
      }
      printf("\n");
    */
    
    for ( bi=0; bi<m; bi++ ) {
      for ( bj=0; bj<n; bj++ ) {

	int a_offset = (bi + bj * m) * (pm * pk);
	int b_offset = bj * pk * pn;
	int c_offset = (bi + bj * m) * (pm * pn);
	
	cblas_dgemm (CblasColMajor, CblasNoTrans , CblasNoTrans, pm, pn, pk, 1.0, a + a_offset, pm, b_buffer + b_offset, pk, 1.0, c + c_offset, pm); 
      }
    }

    ShiftMPIMatrixLeft(comm_row, gm, gn, bm, bn, p, q, a);
  }

  free(b_buffer);

  return 0;
}
/***********************************************************//**
    Perform a right-left dmrg sweep as part of ft-product

    \param[in]     z       - initial guess
    \param[in]     f       - specialized function to multiply core by matrix
    \param[in]     args    - arguments to f
    \param[in,out] phil    - left multipliers
    \param[in]     psir    - right multiplies
    \param[in]     epsilon - splitting tolerance
    \param[in]     opts    - approximation options

    \return na - a new approximation
***************************************************************/
struct FunctionTrain * 
dmrg_sweep_rl(struct FunctionTrain * z, 
              void (*f)(char,size_t,size_t,double *,struct Qmarray **,void *),
              void * args, struct QR ** phil, struct QR ** psir, double epsilon,
              struct MultiApproxOpts * opts)
{
    double * RL = NULL;
    size_t dim = z->dim;
    struct FunctionTrain * na = function_train_alloc(dim);
    na->ranks[0] = 1;
    na->ranks[na->dim] = 1;
    struct OneApproxOpts * o = NULL;
    if (psir[dim-2] == NULL){
        struct Qmarray * temp0 = NULL;
        f('R',dim-1,1,NULL,&temp0,args);
        //qmarray_kron(a->cores[dim-1],b->cores[dim-1]);
        o = multi_approx_opts_get_aopts(opts,dim-1);
        psir[dim-2] = qr_reduced(temp0,0,o);
        qmarray_free(temp0); temp0 = NULL;
    }
    
    size_t nrows = phil[dim-2]->mr;
    size_t nmult = phil[dim-2]->mc;
    size_t ncols = psir[dim-2]->mc;

    RL = calloc_double(nrows * ncols);
    cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols,
                nmult,1.0,phil[dim-2]->mat,nrows,psir[dim-2]->mat,nmult,0.0,RL,nrows);

    double * u = NULL;
    double * vt = NULL;
    double * s = NULL;
    /* printf("Right-Left sweep\n"); */
    /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */
    size_t rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon);
    /* printf("rank=%zu\n",rank); */
    na->ranks[dim-1] = rank;
    na->cores[dim-1] = mqma(vt,psir[dim-2]->Q,rank); 
    

    int ii;
    for (ii = dim-3; ii > -1; ii--){
        double * newpsi = calloc_double( psir[ii+1]->mr * rank);
        //
        cblas_dgemm(CblasColMajor,CblasNoTrans,CblasTrans,
                    psir[ii+1]->mr,rank, psir[ii+1]->mc,
                    1.0,psir[ii+1]->mat,psir[ii+1]->mr,vt,rank,
                    0.0,newpsi,psir[ii+1]->mr);

        struct Qmarray * temp = NULL;
        // qmarray_kron_mat(rank,newpsi,a->cores[ii+1],b->cores[ii+1]);
        f('R',(size_t)ii+1,rank,newpsi,&temp,args);

        qr_free(psir[ii]); psir[ii] = NULL;
        o = multi_approx_opts_get_aopts(opts,(size_t)ii+1);
        psir[ii] = qr_reduced(temp,0,o);
        
        free(RL); RL = NULL;
        free(newpsi); newpsi = NULL;
        free(u); u = NULL;
        free(vt); vt = NULL;
        free(s); s = NULL;
        qmarray_free(temp); temp = NULL; 
        nrows = phil[ii]->mr;
        nmult = phil[ii]->mc;
        ncols = psir[ii]->mc;

        RL = calloc_double(nrows * ncols);

        cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,nrows,ncols,
                    nmult,1.0,phil[ii]->mat,nrows,psir[ii]->mat,nmult,0.0,RL,nrows);

        /* printf("(nrows,ncols)=(%zu,%zu), epsilon=%G\n",nrows,ncols,epsilon); */
        rank = truncated_svd(nrows,ncols,nrows,RL,&u,&s,&vt,epsilon);
        /* printf("rank=%zu\n",rank); */
        na->ranks[ii+1] = rank;
        na->cores[ii+1] = mqma(vt,psir[ii]->Q,rank); 
    }
    
    size_t kk,jj;
    for (jj = 0; jj < rank; jj++){
        for (kk = 0; kk < nrows; kk++){
            u[jj*nrows+kk] = u[jj*nrows+kk]*s[jj];
        }
    }
    
    na->cores[0] = qmam(phil[0]->Q,u,rank);

    /* exit(1); */

    free(RL); RL = NULL;
    free(u); u = NULL;
    free(vt); vt = NULL;
    free(s); s = NULL;

    return na;
}
Beispiel #21
0
int main(int argc, char **argv){

  int     iter, i,ii,j,jj,k,kk,ig,jg,kg; /* dummies                               */
  int     iterations;           /* number of times the multiplication is done     */
  double  dgemm_time,           /* timing parameters                              */
          avgtime;
  double  checksum = 0.0,       /* checksum of result                             */
          ref_checksum;
  double  epsilon = 1.e-8;      /* error tolerance                                */
  int     nthread_input,        /* thread parameters                              */
          nthread;   
  int     num_error=0;          /* flag that signals that requested and 
                                   obtained numbers of threads are the same       */
  static  
  double  * RESTRICT A,         /* input (A,B) and output (C) matrices            */
          * RESTRICT B,      
          * RESTRICT C;
  long    order;                /* number of rows and columns of matrices         */
  int     block;                /* tile size of matrices                          */
  int     shortcut;             /* true if only doing initialization              */

  printf("Parallel Research Kernels version %s\n", PRKVERSION);
  printf("OpenMP Dense matrix-matrix multiplication\n");

#if !MKL  
  if (argc != 4 && argc != 5) {
    printf("Usage: %s <# threads> <# iterations> <matrix order> [tile size]\n",*argv);
#else
  if (argc != 4) {
    printf("Usage: %s <# threads> <# iterations> <matrix order>\n",*argv);
#endif
    exit(EXIT_FAILURE);
  }

  /* Take number of threads to request from command line                          */
  nthread_input = atoi(*++argv); 

  if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) {
    printf("ERROR: Invalid number of threads: %d\n", nthread_input);
    exit(EXIT_FAILURE);
  }

  omp_set_num_threads(nthread_input);

  iterations = atoi(*++argv);
  if (iterations < 1){
    printf("ERROR: Iterations must be positive : %d \n", iterations);
    exit(EXIT_FAILURE);
  }

  order = atol(*++argv);
  if (order < 0) {
    shortcut = 1;
    order    = -order;
  } else shortcut = 0;
  if (order < 1) {
    printf("ERROR: Matrix order must be positive: %ld\n", order);
    exit(EXIT_FAILURE);
  }
  A = (double *) prk_malloc(order*order*sizeof(double));
  B = (double *) prk_malloc(order*order*sizeof(double));
  C = (double *) prk_malloc(order*order*sizeof(double));
  if (!A || !B || !C) {
    printf("ERROR: Could not allocate space for global matrices\n");
    exit(EXIT_FAILURE);
  }

  ref_checksum = (0.25*forder*forder*forder*(forder-1.0)*(forder-1.0));

  #pragma omp parallel for private(i,j) 
  for(j = 0; j < order; j++) for(i = 0; i < order; i++) {
    A_arr(i,j) = B_arr(i,j) = (double) j; 
    C_arr(i,j) = 0.0;
  }

#if !MKL
  if (argc == 5) {
         block = atoi(*++argv);
  } else block = DEFAULTBLOCK;

  #pragma omp parallel private (i,j,k,ii,jj,kk,ig,jg,kg,iter)
  {
  double * RESTRICT AA, * RESTRICT BB, * RESTRICT CC;

  if (block > 0) {
    /* matrix blocks for local temporary copies                                     */
    AA = (double *) prk_malloc(block*(block+BOFFSET)*3*sizeof(double));
    if (!AA) {
      num_error = 1;
      printf("Could not allocate space for matrix tiles on thread %d\n", 
             omp_get_thread_num());
    }
    bail_out(num_error);
    BB = AA + block*(block+BOFFSET);
    CC = BB + block*(block+BOFFSET);
  } 

  #pragma omp master 
  {
  nthread = omp_get_num_threads();

  if (nthread != nthread_input) {
    num_error = 1;
    printf("ERROR: number of requested threads %d does not equal ",
           nthread_input);
    printf("number of spawned threads %d\n", nthread);
  } 
  else {
    printf("Matrix order          = %ld\n", order);
    if (shortcut) 
      printf("Only doing initialization\n"); 
    printf("Number of threads     = %d\n", nthread_input);
    if (block>0)
      printf("Blocking factor       = %d\n", block);
    else
      printf("No blocking\n");
    printf("Block offset          = %d\n", BOFFSET);
    printf("Number of iterations  = %d\n", iterations);
    printf("Using MKL library     = off\n");
  }
  }
  bail_out(num_error); 

  if (shortcut) exit(EXIT_SUCCESS);

  for (iter=0; iter<=iterations; iter++) {

    if (iter==1) {
      #pragma omp barrier
      #pragma omp master
      {
        dgemm_time = wtime();
      }
    }


    if (block > 0) {
  
      #pragma omp for 
      for(jj = 0; jj < order; jj+=block){
        for(kk = 0; kk < order; kk+=block) {
  
          for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) 
          for (kg=kk,k=0; kg<MIN(kk+block,order); k++,kg++) 
            BB_arr(j,k) =  B_arr(kg,jg);
  
          for(ii = 0; ii < order; ii+=block){
  
            for (kg=kk,k=0; kg<MIN(kk+block,order); k++,kg++)
            for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++)
              AA_arr(i,k) = A_arr(ig,kg);
  
            for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) 
            for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++)
              CC_arr(i,j) = 0.0;
         
            for (kg=kk,k=0; kg<MIN(kk+block,order); k++,kg++)
            for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) 
            for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++)
              CC_arr(i,j) += AA_arr(i,k)*BB_arr(j,k);
  
            for (jg=jj,j=0; jg<MIN(jj+block,order); j++,jg++) 
            for (ig=ii,i=0; ig<MIN(ii+block,order); i++,ig++)
              C_arr(ig,jg) += CC_arr(i,j);
  
          }
        }  
      }
    }
    else {
      #pragma omp for 
      for (jg=0; jg<order; jg++) 
      for (kg=0; kg<order; kg++) 
      for (ig=0; ig<order; ig++) 
        C_arr(ig,jg) += A_arr(ig,kg)*B_arr(kg,jg);
    }

  } /* end of iterations                                                          */

  #pragma omp barrier
  #pragma omp master
  {
    dgemm_time = wtime() - dgemm_time;
  }

  } /* end of parallel region                                                     */

#else

  printf("Matrix size           = %ldx%ld\n", order, order);
  printf("Number of threads     = %d\n", nthread_input);
  printf("Using MKL library     = on\n");
  printf("Number of iterations  = %d\n", iterations);

  for (iter=0; iter<=iterations; iter++) {

    if (iter==1) dgemm_time = wtime();

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, order, order, 
                order, 1.0, &(A_arr(0,0)), order, &(B_arr(0,0)), order, 
                1.0, &(C_arr(0,0)), order);
  }
  dgemm_time = wtime()-dgemm_time;
#endif

  for(checksum=0.0,j = 0; j < order; j++) for(i = 0; i < order; i++)
    checksum += C_arr(i,j);

  /* verification test                                                            */
  ref_checksum *= (iterations+1);

  if (ABS((checksum - ref_checksum)/ref_checksum) > epsilon) {
    printf("ERROR: Checksum = %lf, Reference checksum = %lf\n",
           checksum, ref_checksum);
    exit(EXIT_FAILURE);
  }
  else {
    printf("Solution validates\n");
#if VERBOSE
    printf("Reference checksum = %lf, checksum = %lf\n", 
           ref_checksum, checksum);
#endif
  }

  double nflops = 2.0*forder*forder*forder;
  avgtime = dgemm_time/iterations;
  printf("Rate (MFlops/s): %lf  Avg time (s): %lf\n",
         1.0E-06 *nflops/avgtime, avgtime);

  exit(EXIT_SUCCESS);

}
Beispiel #22
0
int main(int argc, char *argv[])
{

   double *a, *b, *c, *aa ;
   unsigned int n ;
   unsigned i, j, k, iInner, jInner, kInner, blockSize ;
   struct timespec ts1, ts2, ts3, ts4, ts5, ts6, ts7 ;

   printf("hello code beginning\n") ;
   n = MATSIZE ; // default settings
   blockSize = BLOCKSIZE ;
   if (argc != 3)
   {
      printf("input matrix size and blocksize\n") ;
      exit(0);
   }
   n = atoi(argv[1]) ;
   blockSize = atoi(argv[2]) ;
   printf("matrix size %d blocksize %d\n", n,blockSize) ;
   if (n%blockSize)
   {
      printf("for this simple example matrix size must be a multiple of the block size.\n  Please re-start \n") ;
      exit(0);
   }
// allocate matrices
   a = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ;
   b = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ;
   c = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ;
   aa = (double *)calloc((n+blockSize)*(n+blockSize), sizeof(double)) ;
   if (aa == NULL) // cheap check only the last allocation checked.
   {
     printf("insufficient memory \n") ;
     exit(0) ;
   }

// fill matrices

   setmat(n, n, a) ;
   setmat(n, n, aa) ;


   srand(1) ; // set random seed (change to go off time stamp to make it better

   fillmat(n,n,b) ;
   fillmat(n,n,c) ;

   current_utc_time(&ts1) ;
// multiply matrices
   abasicmm (n,n,a,b,c) ;

   current_utc_time(&ts2) ;

   setmat(n, n, a) ;

   current_utc_time(&ts3) ;

   abettermm (n,n,a,b,c) ;

   current_utc_time(&ts4) ;

   ablockmm (n, n, aa, b, c, blockSize) ;

   current_utc_time(&ts5) ;

   cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
               n, n, n, 1.0, b, n, c, n, 0.0, a, n);

   current_utc_time(&ts6) ;

   printf("matrix multplies complete \n") ; fflush(stdout) ;

/**/
   checkmatmult(n,n,a,aa) ;

   {
      double t1, t2, t3, t4, tmp ;
      t1 =  ts2.tv_sec-ts1.tv_sec;
      tmp = ts2.tv_nsec-ts1.tv_nsec;
      tmp /= 1.0e+09 ;
      t1 += tmp ;
      printf("ijk ordering basic time %lf\n",t1) ;
      t2 =  ts4.tv_sec-ts3.tv_sec;
      tmp = ts4.tv_nsec-ts3.tv_nsec;
      tmp /= 1.0e+09 ;
      t2 += tmp ;
      printf("ikj ordering bette time %lf\n",t2) ;
      t3 =  ts5.tv_sec-ts4.tv_sec;
      tmp = ts5.tv_nsec-ts4.tv_nsec;
      tmp /= 1.0e+09 ;
      t3 += tmp ;
      printf("ikj blocked time        %lf\n",t3) ;
      t4 =  ts6.tv_sec-ts5.tv_sec;
      tmp = ts6.tv_nsec-ts5.tv_nsec;
      tmp /= 1.0e+09 ;
      t4 += tmp ;
      printf("cblas_dgemm             %lf\n",t4) ;

   }

}
Beispiel #23
0
Result base_case(Problem p) {
  cblas_dgemm(CblasColMajor,CblasNoTrans,CblasNoTrans,p.m,p.n,p.k,1,p.A,p.M,p.B,p.K,0,p.C,p.CM);
  Result r = {p.m, p.n, p.CM, p.C};
  return r;
}
Beispiel #24
0
int main (int argc, char **argv )
{
   int rout=-1,info=0,m,n,k,lda,ldb,ldc;
   double A[2] = {0.0,0.0}, 
          B[2] = {0.0,0.0}, 
          C[2] = {0.0,0.0}, 
          ALPHA=0.0, BETA=0.0;
          
   if (argc > 2){
      rout = atoi(argv[1]);
      info = atoi(argv[2]);
   }
   
   if (rout == 1) {
      if (info==0) {
         printf("Checking if cblas_dgemm fails on parameter 4\n");
         cblas_dgemm( CblasRowMajor,  CblasTrans, CblasNoTrans, INVALID, 0, 0,
                   ALPHA, A, 1, B, 1, BETA, C, 1 );
      }
      if (info==1) {
         printf("Checking if cblas_dgemm fails on parameter 5\n");
         cblas_dgemm( CblasRowMajor,  CblasNoTrans, CblasTrans, 0, INVALID, 0,
                   ALPHA, A, 1, B, 1, BETA, C, 1 );
      }
      if (info==2) {
         printf("Checking if cblas_dgemm fails on parameter 9\n");
         cblas_dgemm( CblasRowMajor,  CblasNoTrans, CblasNoTrans, 0, 0, 2,
                   ALPHA, A, 1, B, 1, BETA, C, 2 );
      }
      if (info==3) {
         printf("Checking if cblas_dgemm fails on parameter 11\n");
         cblas_dgemm( CblasRowMajor,  CblasNoTrans, CblasNoTrans, 0, 2, 2,
                   ALPHA, A, 1, B, 1, BETA, C, 1 );
      }
   } else {
      if (info==0) {
         printf("Checking if F77_dgemm fails on parameter 3\n");
         m=INVALID; n=0; k=0; lda=1; ldb=1; ldc=1;
         F77_dgemm( "T", "N", &m, &n, &k,
                   &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc );
      }
      if (info==1) {
         m=0; n=INVALID; k=0; lda=1; ldb=1; ldc=1;
         printf("Checking if F77_dgemm fails on parameter 4\n");
         F77_dgemm( "N", "T", &m, &n, &k,
                   &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc );
      }
      if (info==2) {
         printf("Checking if F77_dgemm fails on parameter 8\n");
         m=2; n=0; k=0; lda=1; ldb=1; ldc=2;
         F77_dgemm( "N", "N" , &m, &n, &k,
                   &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc );
      }
      if (info==3) {
         printf("Checking if F77_dgemm fails on parameter 10\n");
         m=0; n=0; k=2; lda=1; ldb=1; ldc=1;
         F77_dgemm( "N", "N" , &m, &n, &k,
                   &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc );
      }
   }
          
   return 1;
}
Beispiel #25
0
double *matmul_aat(int n, double *b) {
    double *c = (double *) malloc(n*n*sizeof(double));
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans, n, n, n, 1.0, b, n, b, n, 0.0, c, n);
    return c;
}
Beispiel #26
0
// DGEMM way of matrix multiply using Intel MKL
// Link with Intel MKL library: With MSFT VS and Intel Composer integration: Select build components in the Project context menu.
// For command line - check out the Intel® Math Kernel Library Link Line Advisor 
void multiply5(int msize, int tidx, int numt, TYPE a[][NUM], TYPE b[][NUM], TYPE c[][NUM], TYPE t[][NUM])
{

	double alpha = 1.0, beta = 0.;
	cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,NUM,NUM,NUM,alpha,(const double *)b,NUM,(const double *)a,NUM,beta,(double *)c,NUM);
}
Beispiel #27
0
static inline void
CORE_dgetrf_rectil_update(const PLASMA_desc A, int *IPIV,
                          int column, int n1,     int n2,
                          int thidx,  int thcnt,
                          int ft,     int lt)
{
    int ld, lm, tmpM;
    int ip, j, it, i, ldft;
    double zone  = 1.0;
    double mzone = -1.0;
    double *Atop, *Atop2, *U, *L;
    int offset = A.i;

    ldft = BLKLDD(A, 0);
    Atop  = A(0, 0) + column * ldft;
    Atop2 = Atop    + n1     * ldft;

    if (thidx == 0)
    {
        /* Swap to the right */
        int *lipiv = IPIV+column;
        int idxMax = column+n1;
        for (j = column; j < idxMax; ++j, ++lipiv) {
            ip = (*lipiv) - offset - 1;
            if ( ip != j )
            {
                it = ip / A.mb;
                i  = ip % A.mb;
                ld = BLKLDD(A, it);
                cblas_dswap(n2, Atop2                     + j, ldft,
                            A(it, 0) + (column+n1)*ld + i, ld   );
            }
        }

        /* Trsm on the upper part */
        U = Atop2 + column;
        cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower,
                     CblasNoTrans, CblasUnit,
                     n1, n2, (zone),
                     Atop  + column, ldft,
                     U,              ldft );

        /* Signal to other threads that they can start update */
        CORE_dbarrier_thread( thidx, thcnt );

        /* First tile */
        L = Atop + column + n1;
        tmpM = min(ldft, A.m) - column - n1;

        /* Apply the GEMM */
        cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                     tmpM, n2, n1,
                     (mzone), L,      ldft,
                     U,      ldft,
                     (zone),  U + n1, ldft );

    }
    else
    {
        ld = BLKLDD( A, ft );
        L  = A( ft, 0 ) + column * ld;
        lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb;

        U = Atop2 + column;

        /* Wait for pivoting and triangular solve to be finished
         * before to really start the update */
        CORE_dbarrier_thread( thidx, thcnt );

        /* First tile */
        /* Apply the GEMM */
        cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                     lm, n2, n1,
                     (mzone), L,         ld,
                                         U,         ldft,
                     (zone),  L + n1*ld, ld );
    }

    /* Update the other blocks */
    for( it = ft+1; it < lt; it++)
    {
        ld = BLKLDD( A, it );
        L  = A( it, 0 ) + column * ld;
        lm = it == A.mt-1 ? A.m - it * A.mb : A.mb;

        /* Apply the GEMM */
        cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                     lm, n2, n1,
                     (mzone), L,          ld,
                                         U,          ldft,
                     (zone),  L + n1*ld,  ld );
    }
}
Beispiel #28
0
int main(int argc, char *argv[])
{
	//MPI initialize
	MPI_Init (&argc, &argv);	
	int rank, size, master = 0;
	MPI_Comm_rank (MPI_COMM_WORLD, &rank);	
	MPI_Comm_size (MPI_COMM_WORLD, &size);
	MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);


	CheckPreprocessorMacros();

	/* -------------------------------------------------------------------- */
	/* .. Local variables. */
	/* -------------------------------------------------------------------- */
	timer_t start_t, end_t;
	const integer_t nrhs = 1;
	Error_t error;

	if(rank == master){

		fprintf(stderr, "\nShared Memory Spike Solver.\n");


		/* -------------------------------------------------------------------- */
		/* .. Load and initalize the system Ax=f. */
		/* -------------------------------------------------------------------- */
		matrix_t* A = matrix_LoadCSR("../Tests/spike/penta_15.bin");
		//matrix_t* A = matrix_LoadCSR("../Tests/pentadiagonal/large.bin");
		//matrix_t* A = matrix_LoadCSR("../Tests/dummy/tridiagonal.bin");
		matrix_PrintAsDense( A, "Original coeffient matrix" );

		// Compute matrix bandwidth
		block_t*  x = block_CreateEmptyBlock( A->n, nrhs, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ );
		block_t*  f = block_CreateEmptyBlock( A->n, nrhs, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ );

		block_InitializeToValue( x, __zero  ); // solution of the system
		block_InitializeToValue( f, __punit ); // rhs of the system

		start_t = GetReferenceTime();

		/* compute an optimal solving strategy */
		sm_schedule_t* S = spike_solve_analysis( A, nrhs, size-1 );

		/* create the reduced sytem in advanced, based on the solving strategy */
		matrix_t* R  = matrix_CreateEmptyReducedSystem ( S->p, S->n, S->ku, S->kl);
		block_t*  xr = block_CreateReducedRHS( S->p, S->ku, S->kl, nrhs );

		/* -------------------------------------------------------------------- */
		/* .. Factorization Phase. */
		/* -------------------------------------------------------------------- */
		for(integer_t p=0; p < S->p; p++)
		{
			sendSchedulePacked(S, p+1);
			const integer_t r0 = S->n[p];
			const integer_t rf = S->n[p+1];

			matrix_t* Aij = matrix_ExtractMatrix(A, r0, rf, r0, rf);
			sendMatrix(Aij, p+1);

			block_t*  fi  = block_ExtractBlock( f, r0, rf );
			block_t*  yi  = block_CreateEmptyBlock( rf - r0, nrhs, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ );
			block_SetBandwidthValues( fi, A->ku, A->kl );
			block_SetBandwidthValues( yi, A->ku, A->kl );

			sendBlock(fi, p+1);
			sendBlock(yi, p+1);

			/* Add the tips of the yi block to the reduced RHS */
			block_t* yit = recvBlock(p+1);
			block_t* yib = recvBlock(p+1);
			block_AddTipTOReducedRHS( p, S->ku, S->kl, xr, yit );
			block_AddTipTOReducedRHS( p, S->ku, S->kl, xr, yib );

			/* clean up */
			block_Deallocate (fi );
			block_Deallocate (yi );
			block_Deallocate (yit);
			block_Deallocate (yib);
			
			if(p == 0){
				block_t* Vi = block_CreateEmptyBlock ( rf - r0, A->ku, A->ku, A->kl, _V_BLOCK_, _WHOLE_SECTION_ );
				block_t* Bi = matrix_ExtractBlock    ( A, r0, rf, rf, rf + A->ku, _V_BLOCK_ );
				
				sendBlock(Vi, p+1);
				sendBlock(Bi, p+1);
							
				block_t* Vit = recvBlock(p+1);
				block_t* Vib = recvBlock(p+1);

				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vit );
				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vib );

				block_Deallocate( Bi );
				block_Deallocate( Vi );
				block_Deallocate( Vit);
				block_Deallocate( Vib);
			}
			else if (p == ( S->p -1)){
				block_t* Wi = block_CreateEmptyBlock( rf - r0, A->kl, A->ku, A->kl, _W_BLOCK_, _WHOLE_SECTION_ );
				block_t* Ci = matrix_ExtractBlock(A, r0, rf, r0 - A->kl, r0, _W_BLOCK_ );
				sendBlock(Wi, p+1);
				sendBlock(Ci, p+1);

				block_t* Wit = recvBlock(p+1);
				block_t* Wib = recvBlock(p+1);
				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wit );
				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wib );

				block_Deallocate( Ci );
				block_Deallocate( Wi );
				block_Deallocate( Wit);
				block_Deallocate( Wib);
			}
			else{
				block_t* Vi = block_CreateEmptyBlock ( rf - r0, A->ku, A->ku, A->kl, _V_BLOCK_, _WHOLE_SECTION_ );
				block_t* Bi = matrix_ExtractBlock    ( A, r0, rf, rf, rf + A->ku, _V_BLOCK_ );
				sendBlock(Vi, p+1);
				sendBlock(Bi, p+1);
			
				block_t* Vit = recvBlock(p+1);
				block_t* Vib = recvBlock(p+1);			
				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vit );
				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Vib );

				block_Deallocate( Bi );
				block_Deallocate( Vi );
				block_Deallocate( Vit);
				block_Deallocate( Vib);
				
				block_t* Wi = block_CreateEmptyBlock( rf - r0, A->kl, A->ku, A->kl, _W_BLOCK_, _WHOLE_SECTION_ );
				block_t* Ci = matrix_ExtractBlock(A, r0, rf, r0 - A->kl, r0, _W_BLOCK_ );
				sendBlock(Wi, p+1);
				sendBlock(Ci, p+1);

				block_t* Wit = recvBlock(p+1);
				block_t* Wib = recvBlock(p+1);
				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wit );
				matrix_AddTipToReducedMatrix( S->p, p, S->n, S->ku, S->kl, R, Wib );

				block_Deallocate( Ci );
				block_Deallocate( Wi );
				block_Deallocate( Wit);
				block_Deallocate( Wib);
			}
			matrix_Deallocate( Aij);
		}
		
		MPI_Barrier(MPI_COMM_WORLD);
		/* -------------------------------------------------------------------- */
		/* .. Solution of the reduced system.                                   */
		/* -------------------------------------------------------------------- */

		block_t* yr = block_CreateEmptyBlock( xr->n, xr->m, 0, 0, _RHS_BLOCK_, _WHOLE_SECTION_ );
		fprintf(stderr, "\nSolving reduced linear system\n");
		system_solve ( R->colind, R->rowptr, R->aij, yr->aij, xr->aij, R->n, xr->m);
		block_Print(yr, "Solucion del sistema reducido");


		/* Free some memory, yr and R are not needed anymore */
		block_Deallocate ( xr );
		matrix_Deallocate( R  );

		/* -------------------------------------------------------------------- */
		/* .. Backward substitution phase. */
		/* -------------------------------------------------------------------- */
		for(integer_t p=0; p < S->p; p++)
		{
			fprintf(stderr, "Processing backward solution for the %d-th block\n", p);

			/* compute the limits of the blocks */
			const integer_t obs = S->n[p];        		/* original system starting row */
			const integer_t obe = S->n[p+1];	  		/* original system ending row   */
			const integer_t rbs = S->r[p];		  		/* reduceed system starting row */
			const integer_t rbe = S->r[p+1];			/* reduced system ending row    */
			const integer_t ni  = S->n[p+1] - S->n[p]; 	/* number of rows in the block  */

			/* allocate pardiso configuration parameters */
			MKL_INT pardiso_conf[64];

			/* extract xi sub-block */
			block_t*  xi  = block_ExtractBlock(x, obs, obe );
			sendBlock(xi, p+1);

			/* extract fi sub-block */
			block_t*  fi  = block_ExtractBlock(f, obs, obe );
			sendBlock(fi, p+1);
			printf("Lets go %d\n", p);
			
			if ( p == 0 ){

				block_t* xt_next = block_ExtractBlock ( yr, rbe, rbe + S->ku[p+1]);
				sendBlock(xt_next, p+1);
				block_Deallocate (xt_next);
			}

			else if ( p == ( S->p -1)){

				block_t* xb_prev = block_ExtractBlock ( yr, rbs - S->kl[p], rbs );
				sendBlock(xb_prev, p+1);
				block_Deallocate (xb_prev);
			}

			else{
				block_t* xt_next = block_ExtractBlock ( yr, rbe, rbe + S->ku[p+1]);
				sendBlock(xt_next, p+1);
				block_Deallocate (xt_next);
				
				block_t* xb_prev = block_ExtractBlock ( yr, rbs - S->kl[p], rbs );
				sendBlock(xb_prev, p+1);
				block_Deallocate (xb_prev);
			}
			xi = recvBlock(p+1);
			block_AddBlockToRHS(x, xi, obs, obe);
			block_Deallocate    ( xi );
			block_Deallocate 	( fi );
		}
		schedule_Destroy( S );
		block_Deallocate( yr);
	
		end_t = GetReferenceTime();

		fprintf(stderr, "\nSPIKE solver took %.6lf seconds", end_t - start_t);
		block_Print( x, "Solution of the linear system");

		ComputeResidualOfLinearSystem( A->colind, A->rowptr, A->aij, x->aij, f->aij, A->n, nrhs);
	
		fprintf(stderr, "\nPARDISO REFERENCE SOLUTION...\n");
		SolveOriginalSystem( A, x, f);

		/* -------------------------------------------------------------------- */
		/* .. Clean up. */
		/* -------------------------------------------------------------------- */
		matrix_Deallocate ( A );
		block_Deallocate  ( x );
		block_Deallocate  ( f );



		/* -------------------------------------------------------------------- */
		/* .. Load and initalize the system Ax=f. */
		/* -------------------------------------------------------------------- */
		fprintf(stderr, "\nProgram finished\n");

		debug("Number of malloc() calls %d, number of free() calls %d\n", cnt_alloc, cnt_free );
	}

	else{ //WORKERS

		/* -------------------------------------------------------------------- */
		/* .. Factorization Phase. */
		/* -------------------------------------------------------------------- */
		//fprintf(stderr, "Solving %d-th block\n", p);

		sm_schedule_t* S = recvSchedulePacked(master);
		/* compute the limits of the blocks */
		integer_t p = rank -1;
		const integer_t obs = S->n[p];        		/* original system starting row */
		const integer_t obe = S->n[p+1];	  		/* original system ending row   */
		const integer_t rbs = S->r[p];		  		/* reduceed system starting row */
		const integer_t rbe = S->r[p+1];			/* reduced system ending row    */
		const integer_t ni  = S->n[p+1] - S->n[p]; 	/* number of rows in the block  */

		MKL_INT pardiso_conf[64];

		/* allocate pardiso configuration parameters */
		DirectSolverHander_t *handler = directSolver_CreateHandler();

		directSolver_Configure( handler );

		/* factorize matrix */
		matrix_t* Aij = recvMatrix(master);

		directSolver_Factorize( handler,
			Aij->n,
			Aij->nnz,
			Aij->colind,
			Aij->rowptr,
			Aij->aij, Aij->n);

		/* -------------------------------------------------------------------- */
		/* .. Solve Ai * yi = fi                                                */
		/* Extracts the fi portion from f, creates a yi block used as container */
		/* for the solution of the system. Then solves the system.              */
		/* -------------------------------------------------------------------- */
		
		/* solve the system for the RHS value */
		block_t*  fi = recvBlock(master);
		block_t*  yi = recvBlock(master);

		/* solve Ai * yi = fi */
		directSolver_SolveForRHS( handler, nrhs, yi->aij, fi->aij );

		/* Extract the tips of the yi block */
		block_t* yit = block_ExtractTip( yi, _TOP_SECTION_   , _COLMAJOR_ );
		block_t* yib = block_ExtractTip( yi, _BOTTOM_SECTION_, _COLMAJOR_ );
		sendBlock(yit, master);
		sendBlock(yib, master);

		/* clean up */
		block_Deallocate (fi );
		block_Deallocate (yi );
		block_Deallocate (yit);
		block_Deallocate (yib);

		if ( rank == 1 ){
				
			block_t* Vi = recvBlock(master);
			block_t* Bi = recvBlock(master);

			/* solve Ai * Vi = Bi */
			directSolver_SolveForRHS( handler, Vi->m, Vi->aij, Bi->aij );

			block_t* Vit = block_ExtractTip( Vi, _TOP_SECTION_, _ROWMAJOR_ );
			block_t* Vib = block_ExtractTip( Vi, _BOTTOM_SECTION_, _ROWMAJOR_ );
			sendBlock(Vit, master);
			sendBlock(Vib, master);

			block_t* Bib = block_ExtractTip( Bi, _BOTTOM_SECTION_, _COLMAJOR_ );

			//block_Deallocate( Vi );
			block_Deallocate( Bi );
			block_Deallocate( Vi);
			block_Deallocate( Vit);
			block_Deallocate( Vib);

			//Here Master Resolve Reduced System
			MPI_Barrier(MPI_COMM_WORLD);
			
			block_t* xi = recvBlock(master);
			block_t* fi = recvBlock(master);
			block_t* xt_next = recvBlock(master);
			
			/* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit  + fi */
			cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
				Bib->n,    						/* m - number of rows of A    */
				xt_next->m, 					/* n - number of columns of B */
				Bib->m,    						/* k - number of columns of A */
				__nunit,						/* alpha                      */
				Bib->aij, 						/* A block                    */
				Bib->n,    						/* lda - first dimension of A */
				xt_next->aij, 					/* B block                    */
				xt_next->n,    					/* ldb - first dimension of B */
				__punit,						/* beta                       */
				&fi->aij[ni - S->ku[p]], 		/* C block                    */
				ni ); 					 		/* ldc - first dimension of C */
			
			/* solve Ai * xi = fi */
			directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij );

			sendBlock(xi, master);

			block_Deallocate ( Bib );
			block_Deallocate ( xt_next);
			block_Deallocate ( xi );
			block_Deallocate ( fi );
		}

		else if ( rank == size -1){
			
			block_t* Wi = recvBlock(master);
			block_t* Ci = recvBlock(master);

			/* solve Ai * Wi = Ci */
			directSolver_SolveForRHS( handler, Wi->m, Wi->aij, Ci->aij );


			block_t* Wit = block_ExtractTip( Wi, _TOP_SECTION_, _ROWMAJOR_ );
			block_t* Wib = block_ExtractTip( Wi, _BOTTOM_SECTION_, _ROWMAJOR_ );
			sendBlock(Wit, master);
			sendBlock(Wib, master);
			
			block_t* Cit = block_ExtractTip(Ci, _TOP_SECTION_, _COLMAJOR_ );			
		
			block_Deallocate( Ci );
			block_Deallocate( Wi );
			block_Deallocate( Wit);
			block_Deallocate( Wib);

			//Here Master Resolve Reduced System
			MPI_Barrier(MPI_COMM_WORLD);

			block_t* xi = recvBlock(master);
			block_t* fi = recvBlock(master);
			block_t* xb_prev = recvBlock(master);
			/* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit  + fi */ 
			cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
				Cit->n,    						/* m - number of rows of A    */
				xb_prev->m, 					/* n - number of columns of B */
				Cit->m,    						/* k - number of columns of A */
				__nunit,						/* alpha                      */
				Cit->aij, 						/* A block                    */
				Cit->n,    						/* lda - first dimension of A */
				xb_prev->aij, 					/* B block                    */
				xb_prev->n,    					/* ldb - first dimension of B */
				__punit,						/* beta                       */
				fi->aij, 			 		    /* C block                    */
				ni );		 					/* ldc - first dimension of C */

			/* solve Ai * xi = fi */
			directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij );

			sendBlock(xi, master);

			block_Deallocate ( Cit );
			block_Deallocate ( xb_prev);
			block_Deallocate ( xi );
			block_Deallocate ( fi );
		}

		else{
			block_t* Vi = recvBlock(master);
			block_t* Bi = recvBlock(master);

			/* solve Ai * Vi = Bi */
			directSolver_SolveForRHS( handler, Vi->m, Vi->aij, Bi->aij );

			block_t* Vit = block_ExtractTip( Vi, _TOP_SECTION_, _ROWMAJOR_ );
			block_t* Vib = block_ExtractTip( Vi, _BOTTOM_SECTION_, _ROWMAJOR_ );
			sendBlock(Vit, master);
			sendBlock(Vib, master);

			block_t* Bib = block_ExtractTip( Bi, _BOTTOM_SECTION_, _COLMAJOR_ );

			block_Deallocate( Bi );
			block_Deallocate( Vi );
			block_Deallocate( Vit);
			block_Deallocate( Vib);

			block_t* Wi = recvBlock(master);
			block_t* Ci = recvBlock(master);

			/* solve Ai * Wi = Ci */
			directSolver_SolveForRHS( handler, Wi->m, Wi->aij, Ci->aij );

			block_t* Wit = block_ExtractTip( Wi, _TOP_SECTION_, _ROWMAJOR_ );
			block_t* Wib = block_ExtractTip( Wi, _BOTTOM_SECTION_, _ROWMAJOR_ );
			sendBlock(Wit, master);
			sendBlock(Wib, master);
			
			block_t* Cit = block_ExtractTip(Ci, _TOP_SECTION_, _COLMAJOR_ );
			
			block_Deallocate( Ci );
			block_Deallocate( Wi );
			block_Deallocate( Wit);
			block_Deallocate( Wib);

			//Here Master Resolve Reduced System
			MPI_Barrier(MPI_COMM_WORLD);
			
			block_t* xi = recvBlock(master);
			block_t* fi = recvBlock(master);
			block_t* xt_next = recvBlock(master);
			
			/* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit  + fi */
			cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
				Bib->n,    						/* m - number of rows of A    */
				xt_next->m, 					/* n - number of columns of B */
				Bib->m,    						/* k - number of columns of A */
				__nunit,						/* alpha                      */
				Bib->aij, 						/* A block                    */
				Bib->n,    						/* lda - first dimension of A */
				xt_next->aij, 					/* B block                    */
				xt_next->n,    					/* ldb - first dimension of B */
				__punit,						/* beta                       */
				&fi->aij[ni - S->ku[p]], 		/* C block                    */
				ni ); 					 		/* ldc - first dimension of C */

			directSolver_ApplyFactorToRHS( Aij->colind, Aij->rowptr, Aij->aij, xi->aij, fi->aij, Aij->n, xi->m, &pardiso_conf );

			/* solve Ai * xi = fi */
			directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij );

			block_Deallocate ( Bib );
			block_Deallocate ( xt_next); 

			block_t* xb_prev = recvBlock(master);
			/* Backward substitution, implicit scheme: xi = -1.0 * Bi * xit  + fi */ 
			cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
				Cit->n,    						/* m - number of rows of A    */
				xb_prev->m, 					/* n - number of columns of B */
				Cit->m,    						/* k - number of columns of A */
				__nunit,						/* alpha                      */
				Cit->aij, 						/* A block                    */
				Cit->n,    						/* lda - first dimension of A */
				xb_prev->aij, 					/* B block                    */
				xb_prev->n,    					/* ldb - first dimension of B */
				__punit,						/* beta                       */
				fi->aij, 			 		    /* C block                    */
				ni );		 					/* ldc - first dimension of C */

			/* solve Ai * xi = fi */
			directSolver_SolveForRHS( handler, xi->m, xi->aij, fi->aij );

			sendBlock(xi, master);

			block_Deallocate ( Cit );
			block_Deallocate ( xb_prev);
			block_Deallocate ( xi );
			block_Deallocate ( fi );
		
		}
		/* Show statistics and clean up solver internal memory */
		directSolver_ShowStatistics(handler);
		directSolver_Finalize(handler);

		schedule_Destroy  ( S );
		matrix_Deallocate(Aij);	
		
		debug("Number of malloc() calls %d, number of free() calls %d\n", cnt_alloc, cnt_free );
	}
	debug("Rank %d Finished!\n", rank);
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Finalize();
	return 0;
}
Beispiel #29
0
 void wrapper_cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
                          const int M, const int N, const int K, const double alpha,
                          const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc)
   {
              cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
   }
Beispiel #30
0
/*
 * test ga_dgemm
 * Note: - change nummax for large arrays
 *       - turn off "dgemm_verify" for large arrays due to memory 
 *         limitations, as dgemm_verify=1 for large arrays produces 
 *         segfault, dumps core,or any crap.
 */
int main(int argc, char **argv)
{
    int num_m;
    int num_n;
    int num_k;
    int i;
    int ii;
    double *h0;
    int g_c;
    int g_b;
    int g_a;
    double a;
    double t1;
    double mf;
    double avg_t[ntrans];
    double avg_mf[ntrans];
    int itime;
    int ntimes;
    int nums_m[/*howmany*/] = {512,1024};
    int nums_n[/*howmany*/] = {512,1024};
    int nums_k[/*howmany*/] = {512,1024};
    char transa[/*ntrans*/] = "ntnt";
    char transb[/*ntrans*/] = "nntt";
    char ta;
    char tb;
    double *tmpa;
    double *tmpb;
    double *tmpc;
    int ndim;
    int dims[2];
#ifdef BLOCK_CYCLIC
    int block_size[2];
#endif

#if defined(USE_ELEMENTAL)
    // initialize Elemental (which will initialize MPI)
    ElInitialize( &argc, &argv );
    ElMPICommRank( MPI_COMM_WORLD, &me );
    ElMPICommSize( MPI_COMM_WORLD, &nproc );
    // instantiate el::global array
    ElGlobalArraysConstruct_d( &eldga );
    // initialize global arrays
    ElGlobalArraysInitialize_d( eldga );
#else
    MP_INIT(argc,argv);
    if (!MA_init(MT_DBL,1,20000000)) {
        GA_Error("failed: ma_init(MT_DBL,1,20000000)",10);
    }
    GA_INIT(argc,argv);
    me = GA_Nodeid();
#endif

    h0 = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpa = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpb = (double*)malloc(sizeof(double) * nummax*nummax);
    tmpc = (double*)malloc(sizeof(double) * nummax*nummax);

    ii = 0;
    for (i=0; i<nummax*nummax; i++) {
        ii = ii + 1;
        if (ii > nummax) {
            ii = 0;
        }
        h0[i] = ii;
    }

    /* Compute times assuming 500 mflops and 5 second target time */
    /* ntimes = max(3.0d0,5.0d0/(4.0d-9*num**3)); */
    ntimes = 5;

    for (ii=0; ii<howmany; ii++) {
        num_m = nums_m[ii];
        num_n = nums_n[ii];
        num_k = nums_k[ii];
        a = 0.5/(num_m*num_n);
        if (num_m > nummax || num_n > nummax || num_k > nummax) {
            GA_Error("Insufficient memory: check nummax", 1);
        }

#ifndef BLOCK_CYCLIC
        ndim = 2;

	/*
        dims[0] = num_m;
        dims[1] = num_n;
	*/
        dims[1] = num_m;
        dims[0] = num_n;

#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_c", NULL, &g_c );
#else
        if (!((g_c = NGA_Create(MT_DBL,ndim,dims,"g_c",NULL)))) {
            GA_Error("failed: create g_c",20);
        }
#endif
	/*
        dims[0] = num_k;
        dims[1] = num_n;
	*/
        dims[1] = num_k;
        dims[0] = num_n;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_b", NULL, &g_b );
#else
        if (!((g_b = NGA_Create(MT_DBL,ndim,dims,"g_b",NULL)))) {
            GA_Error("failed: create g_b",30);
        }
#endif
	/*
        dims[0] = num_m;
        dims[1] = num_k;
	*/
        dims[1] = num_m;
        dims[0] = num_k;
#if defined(USE_ELEMENTAL)
        ElGlobalArraysCreate_d( eldga, ndim, dims, "g_a", NULL, &g_a );
#else
        if (!((g_a = NGA_Create(MT_DBL,ndim,dims,"g_a",NULL)))) {
            GA_Error("failed: create g_a",40);
        }
#endif
#else
        ndim = 2;
        block_size[0] = 128;
        block_size[1] = 128;

        dims[0] = num_m;
        dims[1] = num_n;
        g_c = GA_Create_handle();
        GA_Set_data(g_c,ndim,dims,MT_DBL);
        GA_Set_array_name(g_c,"g_c");
        GA_Set_block_cyclic(g_c,block_size);
        if (!GA_Allocate(g_c)) {
            GA_Error("failed: create g_c",40);
        }

        dims[0] = num_k;
        dims[1] = num_n;
        g_b = GA_Create_handle();
        GA_Set_data(g_b,ndim,dims,MT_DBL);
        GA_Set_array_name(g_b,"g_b");
        GA_Set_block_cyclic(g_b,block_size);
        if (!ga_allocate(g_b)) {
            GA_Error("failed: create g_b",40);
        }

        dims[0] = num_m;
        dims[1] = num_k;
        g_a = GA_Create_handle();
        GA_Set_data(g_a,ndim,dims,MT_DBL);
        GA_Set_array_name(g_a,"g_a");
        GA_Set_block_cyclic(g_a,block_size);
        if (!ga_allocate(g_a)) {
            GA_Error('failed: create g_a',40);
        }
#endif         

        /* Initialize matrices A and B */
        if (me == 0) { 
            load_ga(g_a, h0, num_m, num_k);
            load_ga(g_b, h0, num_k, num_n);
        }
#if defined(USE_ELEMENTAL)
        double zero = 0.0;
        ElGlobalArraysFill_d( eldga, g_c, &zero );
	ElGlobalArraysSync_d( eldga );
#else
        GA_Zero(g_c);
        GA_Sync();
#endif
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\nMatrix Multiplication on C = A[%ld,%ld]xB[%ld,%ld]\n",
                    (long)num_m, (long)num_k, (long)num_k, (long)num_n);
            fflush(stdout);
        }

        for (i=0; i<ntrans; i++) {
            avg_t[i]  = 0.0;
            avg_mf[i] = 0.0;
        }

        for (itime=0; itime<ntimes; itime++) {
            for (i=0; i<ntrans; i++) {
#if defined(USE_ELEMENTAL)
	        ElGlobalArraysSync_d( eldga );
#else
                GA_Sync();
#endif
                ta = transa[i];
                tb = transb[i];
                t1 = MP_TIMER();
#if defined(USE_ELEMENTAL)
		ElGlobalArraysDgemm_d( eldga, ta, tb, num_m, num_n, num_k, 1.0, g_a, g_b, 0.0, g_c );
#else
                GA_Dgemm(ta,tb,num_m,num_n,num_k,1.0, g_a, g_b, 0.0, g_c);
#endif
                t1 = MP_TIMER() - t1;
#if defined(USE_ELEMENTAL)
                if (me == 0) {
#else
                if (GA_Nodeid() == 0) {
#endif
#if defined(USE_ELEMENTAL)
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/nproc;
#else
                    mf = 2e0*num_m*num_n*num_k/t1*1e-6/GA_Nnodes();
#endif
                    avg_t[i]  = avg_t[i]+t1;
                    avg_mf[i] = avg_mf[i] + mf;
                    printf("%15s%2d: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                            "Run#", itime, t1, mf, ta, tb);
                    fflush(stdout);
                    if (dgemm_verify && itime == 0) {
                        /* recall the C API swaps the matrix order */
                        /* we swap it here for the Fortran-based verify */
                        verify_ga_dgemm(tb, ta, num_n, num_m, num_k, 1.0,
                                g_b, g_a, 0.0, g_c, tmpb, tmpa, tmpc);
                    }
                }
            }
        }
#if defined(USE_ELEMENTAL)
        if (me == 0) {
#else
        if (GA_Nodeid() == 0) {
#endif
            printf("\n");
            for (i=0; i<ntrans; i++) {
                printf("%17s: %12.4f seconds %12.1f mflops/proc  %c %c\n",
                        "Average", avg_t[i]/ntimes, avg_mf[i]/ntimes,
                        transa[i], transb[i]);
            }
            if(dgemm_verify) {
                printf("All GA_Dgemms are verified...O.K.\n");
            }
            fflush(stdout);
        }

        /*
           GA_Print(g_a);
           GA_Print(g_b);
           GA_Print(g_c);
           */
#if defined(USE_ELEMENTAL)
        ElGlobalArraysDestroy_d( eldga, g_a );
        ElGlobalArraysDestroy_d( eldga, g_b );
        ElGlobalArraysDestroy_d( eldga, g_c );
#else
        GA_Destroy(g_c);
        GA_Destroy(g_b);
        GA_Destroy(g_a);
#endif
    }

    /* ???
       format(a15, i2, ': ', e12.4, ' seconds ',f12.1, 
       .     ' mflops/proc ', 3a2)
       */
#if defined(USE_ELEMENTAL)
    if (me == 0) {
#else
    if (GA_Nodeid() == 0) {
#endif
        printf("All tests successful\n");
    }

    free(h0);
    free(tmpa);
    free(tmpb);
    free(tmpc);
#if defined(USE_ELEMENTAL)
    // call el::global arrays destructor
    ElGlobalArraysTerminate_d( eldga );
    ElGlobalArraysDestruct_d( eldga );
    ElFinalize();
#else
    GA_Terminate();
    MP_FINALIZE();
#endif
    return 0;
}


/*
 * Verify for correctness. Process 0 computes BLAS dgemm 
 * locally. For larger arrays, disbale this test as memory
 * might not be sufficient
 */
void verify_ga_dgemm(char xt1, char xt2, int num_m, int num_n, int num_k,
        double alpha, int g_a, int g_b, double beta, int g_c,
        double *tmpa, double *tmpb, double *tmpc)
{
    int i,j,type,ndim,dims[2],lo[2],hi[2];
    double abs_value;

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            tmpc[j+i*num_m] = -1.0;
            tmpa[j+i*num_m] = -2.0;
        }
    }

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_a, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_a, lo, hi, tmpa, &dims[1]);
#endif

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_b, lo, hi, tmpb, &dims[1] );
#else
    NGA_Get(g_b, lo, hi, tmpb, &dims[1]);
#endif

    /* compute dgemm sequentially */
#if defined(USE_ELEMENTAL)
    cblas_dgemm ( CblasRowMajor, ( xt1 == 'n'? CblasNoTrans: CblasTrans ), 
	    ( xt2 == 'n'? CblasNoTrans: CblasTrans ), 
	    num_m /* M */, num_n /* N */, num_k /* K */, 
	    alpha, tmpa, num_m, /* lda */ 
	    tmpb, num_k, /* ldb */ beta, 
	    tmpc, num_m /* ldc */);
#else
    xb_dgemm(&xt1, &xt2, &num_m, &num_n, &num_k,
            &alpha, tmpa, &num_m,
            tmpb, &num_k, &beta,
            tmpc, &num_m);
#endif

    /* after computing c locally, verify it with the values in g_c */

#if defined(USE_ELEMENTAL)
    ElGlobalArraysInquire_d( eldga, g_a, &ndim, dims );
#else
    NGA_Inquire(g_a, &type, &ndim, dims);
#endif
    lo[0] = 0;
    lo[1] = 0;
    hi[0] = dims[0]-1;
    hi[1] = dims[1]-1;
#if defined(USE_ELEMENTAL)
    ElGlobalArraysGet_d( eldga, g_c, lo, hi, tmpa, &dims[1] );
#else
    NGA_Get(g_c, lo, hi, tmpa, &dims[1]);
#endif

    for (i=0; i<num_n; i++) {
        for (j=0; j<num_m; j++) {
            abs_value = fabs(tmpc[j+i*num_m]-tmpa[j+i*num_m]);
            if(abs_value > 1.0 || abs_value < -1.0) {
                printf("Values are = %f %f\n",
                        tmpc[j+i*num_m], tmpa[j+i*num_m]);
                printf("Values are = %f %f\n", 
                        fabs(tmpc[j+i*num_m]-tmpa[j*i*num_m]), abs_value);
                fflush(stdout);
                GA_Error("verify ga_dgemm failed", 1);
            }
        }
    }
}

/**
 * called by process '0' (or your master process )
 */
void load_ga(int handle, double *f, int dim1, int dim2)
{
      int lo[2], hi[2];
      
      if (dim1 < 0 || dim2 < 0) {
          return;
      }

      lo[0] = 0;
      lo[1] = 0;
      hi[0] = dim1-1;
      hi[1] = dim2-1;
#if defined(USE_ELEMENTAL)
      ElGlobalArraysPut_d( eldga, handle, lo, hi, f, &dim1 );
#else
      NGA_Put(handle, lo, hi, f, &dim1);
#endif
}