Example #1
0
void test_dgemm_TT()
{
    const size_t m=2,n=3,k=4;
    double a[4*2]={
	1,2,
	3,4,
        5,6,
	7,8,
    };
    double b[3*4]={
	9,10,11,12,
	13,14,15,16,
	17,18,19,20,
    };
    double c[2*3]={
	3,2,1,
	6,5,4,
    };
    double d[2*3];
    size_t i;
    for(i=0; i<m*n; i++) d[i]=c[i];

    my_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
	    m,n,k,2.0,a,m,b,k,2.0,c,n);
    cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
	    m,n,k,2.0,a,m,b,k,2.0,d,n);
    for(i=0; i<m*n; i++){
	assert(c[i]==d[i]);
    }
}
Example #2
0
int main(void) {
	const int size[NSTEPS] = {500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000};  // specification of array sizes
	static const double epsilon = 0.00001; // maximum floating point error allowed between doubles to be considered equal
	const double flops_per_iteration = 2.0;
	
	int i;  // index variables for looping
	double time[2]; // elapsed time. 0 is test case, 1 is base case
	int mflops[2]; // calculated mflops.  0 is test case, 1 is base case
	double* a;
	double* b;
	double* ctest;
	double* cbase;
	stopwatch* sw = stopwatch_new();
	
	for(i = 0; i < NSTEPS; i++) {
	  int n = size[i];
	  int n2 = n * n;
	  a = (double*) malloc(n2*sizeof(double));
 	  b = (double*) malloc(n2*sizeof(double));
 	  ctest = (double*) malloc(n2*sizeof(double));
 	  cbase = (double*) malloc(n2*sizeof(double));
 	  rand_square_double_matrix(i+10, n, a);
 	  rand_square_double_matrix(i+11, n, b);
    zero_square_double_matrix(n, ctest);
	  zero_square_double_matrix(n, cbase);
	  	  
	  stopwatch_restart(sw);
    my_dgemm(n, a, b, ctest);
	  stopwatch_stop(sw);
    time[0] = stopwatch_time(sw);
	  
	  stopwatch_restart(sw);
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1.0, a, n, b, n, 1.0, cbase, n);
	  stopwatch_stop(sw);
	  
	  //printf("A\n");
	  //print_square_matrix(n, a);
	  //printf("B\n");
	  //print_square_matrix(n, b);	  
	  //printf("my_dgemm\n");
	  //print_square_matrix(n, ctest);
	  //printf("cdgemm\n");
	  //print_square_matrix(n, cbase);
	  
	  assert_equal(n, ctest, cbase, epsilon);
    time[1] = stopwatch_time(sw);
    mflops[0] = calc_mflops(flops_per_iteration, n, time[0]);
    mflops[1] = calc_mflops(flops_per_iteration, n, time[1]);
    	  
	  printf("%d, %5.2f, %d, %5.2f, %d\n", n, time[0], mflops[0], time[1], mflops[1]);

    free(a);
    free(b);
    free(ctest);
    free(cbase);
  }
	stopwatch_delete(sw);
	return 0;
}
Example #3
0
/* multiply two matrices (a wrapper to BLAS routine)
 * INPUT
 *  A [na1, na2]
 *  B [nb1, nb2]
 *  where (na2 == nb1)
 * OUTPUT
 *  C [na1, nb2] = A [na1, na2] . B [nb1, nb2]
 */
void
mul_matrices (const double *A, int na1, int na2,
	      const double *B, int nb1, int nb2,
	      double *C)
{
  if (na2 != nb1)
    {
      fprintf (stderr, "illeagal multiplication in mul_matrices().\n");
      exit (1);
    }
  if (A == C || B == C)
    {
      fprintf (stderr, "illeagal output pointer in mul_matrices().\n");
      exit (1);
    }

#ifdef HAVE_CBLAS_H
  /* use ATLAS' CBLAS routines */
  cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans,
	       na1, nb2, na2,
	       1.0, A, na2,
	       B, nb2,
	       0.0, C, nb2);

#else // !HAVE_CBLAS_H
# ifdef HAVE_BLAS_H
  /* use Fortran BLAS routines */
  dgemm_wrap (na1, nb2, na2,
	      1.0, A,
	      B,
	      0.0, C);

# else // !HAVE_BLAS_H
  /* use local BLAS routines */
  my_dgemm (na1, nb2, na2,
	    1.0, A, na2,
	    B, nb2,
	    0.0, C, nb2);

# endif // !HAVE_BLAS_H
#endif // !HAVE_CBLAS_H
}
Example #4
0
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[] ) 
{

/* deal with the underscores now so that they can be ignored after this point */
#ifdef WINDOWS
    double (*my_ddot)( int *, double *, int *, double *, int *) = ddot;
    complex16 (*my_zdot)( int *, complex16 *, int *, complex16 *, int *) = zdotu;
    void (*my_dgemm)(char *, char *, int *, int *, int *, double *, double *,
            int *, double *, int *, double *, double *, int *) = dgemm;
    void (*my_zgemm)(char *, char *, int *, int *, int *, complex16 *, complex16 *,
        int *, complex16 *, int *, complex16 *, complex16 *, int *) = zgemm;
#else
    double (*my_ddot)( int *, double *, int *, double *, int *) = ddot_;
    complex16 (*my_zdot)( int *, complex16 *, int *, complex16 *, int *) = zdotu_;
    void (*my_dgemm)(char *, char *, int *, int *, int *, double *, double *,
            int *, double *, int *, double *, double *, int *) = dgemm_;
    void (*my_zgemm)(char *, char *, int *, int *, int *, complex16 *, complex16 *,
            int *, complex16 *, int *, complex16 *, complex16 *, int *) = zgemm_;
#endif

    mwSize M, N, K, K2;
    mwSize nOmega1, nOmega2, nOmega;
    mwIndex i,j,k,m;
    double *U, *V, *output, *outputBLAS;
    double *U_imag, *V_imag, *output_imag; /* for complex data */
    complex16 *U_cplx, *V_cplx, temp_cplx; /* for complex data */
    double *omega, *omegaX, *omegaY;
    mwIndex *omegaI, *omegaJ;  /* for sparse version */
    int SPARSE = false;
    int COMPLEX = false;
    int USE_BLAS = false;
    int LARGE_BIT = false;
    int MODE_THREE = false;
    complex16 alpha_cplx, beta_cplx;
    complex16 *output_cplx;

    int one = 1;

    char transA = 'N', transB = 'T';
    mwSize LDA, LDB;
    double alpha, beta;
    double BLAS_CUTOFF;
    
    /* Check for proper number of input and output arguments */    
    if ( (nrhs < 3) || (nrhs > 5) ) {
        printUsage();
        mexErrMsgTxt("Three to five input argument required.");
    } 
    if(nlhs > 1){
        printUsage();
        mexErrMsgTxt("Too many output arguments.");
    }
    
    /* Check data type of input argument  */
    if (!(mxIsDouble(prhs[0])) || !((mxIsDouble(prhs[1]))) ){
        printUsage();
        mexErrMsgTxt("Input arguments wrong data-type (must be doubles).");
    }   

    /* Get the size and pointers to input data */
    /* TRANSPOSE VERSION: switch mxGetM and mxGetN */
    M  = mxGetN(prhs[0]);
    K  = mxGetM(prhs[0]);
    N  = mxGetN(prhs[1]);
    K2  = mxGetM(prhs[1]);
    if ( K != K2 ) {
        printUsage();
        mexErrMsgTxt("Inner dimension of U and V' must agree.");
    }
    COMPLEX = (( (mxIsComplex(prhs[0])) ) || (mxIsComplex(prhs[1])) );
    nOmega1 = mxGetM( prhs[2] );
    nOmega2 = mxGetN( prhs[2] );


    /* on 64-bit systems, these may be longs, but I really want
     * them to be ints so that they work with the BLAS calls */
    mxAssert(M<INT_MAX,"Matrix is too large for 32-bit FORTRAN");
    mxAssert(N<INT_MAX,"Matrix is too large for 32-bit FORTRAN");
    mxAssert(K<INT_MAX,"Matrix is too large for 32-bit FORTRAN");
    
    
    if ( (nOmega1 != 1) && (nOmega2 != 1) ) {
/*         mexErrMsgTxt("Omega must be a vector"); */
        /* Update:
         * if this happens, we assume Omega is really a sparse matrix
         * and everything is OK */
        if ( ( nOmega1 != M ) || ( nOmega2 != N ) || (!mxIsSparse( prhs[2] )) ){
            printUsage();
            mexErrMsgTxt("Omega must be a vector or a sparse matrix");
        }
        nOmega = mxGetNzmax( prhs[2] );
        SPARSE = true;
    } else {
        nOmega = nOmega1 < nOmega2 ? nOmega2 : nOmega1;
        if ( nOmega > N*M ) {
            printUsage();
            mexErrMsgTxt("Omega must have M*N or fewer entries");
        }
    }

    U = mxGetPr(prhs[0]);
    V = mxGetPr(prhs[1]);
    if (COMPLEX) {
        U_imag = mxGetPi(prhs[0]);
        V_imag = mxGetPi(prhs[1]);
        plhs[0] = mxCreateDoubleMatrix(nOmega, 1, mxCOMPLEX);
        output = mxGetPr(plhs[0]);
        output_imag = mxGetPi(plhs[0]);

        U_cplx = (complex16 *)mxMalloc( M*K*sizeof(complex16) );
        V_cplx = (complex16 *)mxMalloc( N*K*sizeof(complex16) );
        if ( (U_cplx == NULL) || ( V_cplx == NULL ) ) {
            /* this should be very rare */
            mexErrMsgTxt("Unable to allocate memory.  Matrix too large?");
        }
        

        for ( i=0 ; i < M*K ; i++ ){
            U_cplx[i].re = (double)U[i];
            U_cplx[i].im = (double)U_imag[i];
        }
        for ( i=0 ; i < N*K ; i++ ){
            V_cplx[i].re = (double)V[i];
            V_cplx[i].im = -(double)V_imag[i]; 
            /* minus sign, since adjoint, not transpose */
        }
        
    } else {
        plhs[0] = mxCreateDoubleMatrix(nOmega, 1, mxREAL);
        /* mxCreateDoubleMatrix automatically zeros out the entries */
        output = mxGetPr(plhs[0]);
    }

    /* Check for the optional input argument that specifies the cutoff
     * for level-3 BLAS */
    BLAS_CUTOFF = 0.4;  
    MODE_THREE = false;
    if ( nrhs > 4 ) {
        BLAS_CUTOFF = (double)*mxGetPr( prhs[4] );
        MODE_THREE = true;
    } else if (nrhs > 3 ) {
        /* decide if this third argument is BLAS_CUTOFF,
         * or omegaX */
        nOmega1 = mxGetM( prhs[3] );
        nOmega2 = mxGetN( prhs[3] );
        if ( (nOmega1 == 1) && (nOmega2 == 1) ) {
            BLAS_CUTOFF = (double)*mxGetPr( prhs[3] );
            MODE_THREE = false;
        } else {
            MODE_THREE = true;
        }
    }
    /* Decide if we'll make a level-3 BLAS call */
    USE_BLAS = ( nOmega >= BLAS_CUTOFF * (M*N) );



    /* 
     * We have 3 "modes":
     * Mode 1 
     *      if omega is a vector of linear indices
     * Mode 2
     *      if omega is given only by the nonzero elements of an input 
     *      sparse matrix Y
     * Mode 3
     *      if omega is given as a set of subscripts, i.e. omegaX, omegaY
     *      then the "for" loop is slightly different
     *
     *  October 29, 2009: changing this a bit.
     *      For any of the modes, we first check if length(omega) > .8*M*N
     *      If so, we make a level-3 BLAS call (dgemm), and then use this
     *      matrix as needed.
     *      
     */


    /* determine if we're on a 64-bit processor */
    LARGE_BIT = ( sizeof( size_t ) > 4 );



    if ( USE_BLAS ) {
        /* we need to compute A itself, so use level-3 BLAS */
        /* TRANSPOSE VERSION: switch N and T below, and change the step size and LDA */
        /*
        transA = 'N';
        transB = 'T';
        LDA = M;
        LDB = N; */
        transA = 'T';
        transB = 'N';
        LDA = K;
        LDB = K;

        if (COMPLEX) {
            alpha_cplx.re = 1.0;  alpha_cplx.im = 0.0;
            beta_cplx.re = 0.0;   beta_cplx.im  = 0.0;
            /* need to make a new complex data structure */
            output_cplx = (complex16 *) mxMalloc( M*N*sizeof(complex16) );
            if ( output_cplx == NULL ) {
                /* we don't have enough RAM available */
                USE_BLAS = false;
            } else {
                my_zgemm(&transA,&transB,(int*)&M,(int*)&N,(int*)&K,
                    &alpha_cplx,U_cplx,(int*)&LDA,V_cplx,(int*)&LDB,&beta_cplx,output_cplx,(int *)&M );
            }
        } else {
            outputBLAS = (double *)mxMalloc( M*N*sizeof(double) );
            if ( outputBLAS == NULL ) {
                /* we don't have enough RAM available */
                USE_BLAS = false;
            } else {
                alpha = 1.0;
                beta = 0.0;
                my_dgemm(&transA,&transB,(int*)&M,(int*)&N,(int*)&K,
                        &alpha,U,(int*)&LDA,V,(int*)&LDB,&beta,outputBLAS,(int*)&M );
            }
        }
    }



    /* --- MODE 1 ---*/
    if ( (!MODE_THREE) && (!SPARSE) ) {
         /* by default, make output the same shape (i.e. row- or column-
          * vector) as the input "omega" */
        mxSetM( plhs[0], mxGetM( prhs[2] ) );
        mxSetN( plhs[0], mxGetN( prhs[2] ) );
        omega = mxGetPr(prhs[2]);

        if ( !USE_BLAS ) {
            
            if ( (COMPLEX) && (LARGE_BIT) ) {
              /* TRANSPOSE VERSION: this needs to be tested!!! */
              for ( k=0 ; k < nOmega ; k++ ){
                i = (mwIndex) ( (mwIndex)(omega[k]-1) % M);
                j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M);
                
                /* implement the BLAS call myself, since BLAS isn't working for me
                 * ZDOTU(N,ZX,INCX,ZY,INCY)  
                 * */
                temp_cplx.re = 0.0;
                temp_cplx.im = 0.0;
                for ( m=0 ; m < K ; m++ ){
                    /*temp_cplx.re += U_cplx[i+m*M].re * V_cplx[j+m*N].re
                        - U_cplx[i+m*M].im * V_cplx[j+m*N].im;
                    temp_cplx.im += U_cplx[i+m*M].im * V_cplx[j+m*N].re
                        + U_cplx[i+m*M].re * V_cplx[j+m*N].im; */
                    temp_cplx.re += U_cplx[K*i+m].re * V_cplx[K*j+m].re
                        - U_cplx[K*i+m].im * V_cplx[K*j+m].im;
                    temp_cplx.im += U_cplx[K*i+m].im * V_cplx[K*j+m].re
                        + U_cplx[K*i+m].re * V_cplx[K*j+m].im;
                }
                output[k] = temp_cplx.re;
                output_imag[k] = temp_cplx.im;
                
              }
            } else if (COMPLEX) {
              /* TRANSPOSE VERSION: UPDATED */
              for ( k=0 ; k < nOmega ; k++ ){
                i = (mwIndex) ( (mwIndex)(omega[k]-1) % M);
                j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M);
                /*temp_cplx = my_zdot( (int*) &K, U_cplx+i, (int*)&M, V_cplx+j, (int*)&N ); */
                temp_cplx = my_zdot( (int*) &K, U_cplx+i*K, (int*)&one, V_cplx+j*K, (int*)&one ); 
                output[k] = temp_cplx.re;
                output_imag[k] = temp_cplx.im;
              }

            } else {
              /* TRANSPOSE VERSION: UPDATED */
              for ( k=0 ; k < nOmega ; k++ ){
                /* don't forget that Matlab is 1-indexed, C is 0-indexed */
                i = (mwIndex) ( (mwIndex)(omega[k]-1) % M);
                j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M);
                /*output[k] = my_ddot( (int*)&K, U+i, (int*)&M, V+j, (int*)&N );*/
                output[k] = my_ddot( (int*)&K, U+i*K, (int*)&one, V+j*K, (int*)&one );
              }
            }
        } else {  /* (USE_BLAS) is true */
            /* We already have the full matrix, so just find the entries */
            if (COMPLEX) {
                for ( k=0 ; k < nOmega ; k++ ){
                    output[k] = output_cplx[ (mwIndex)omega[k] -1 ].re;
                    output_imag[k] = output_cplx[ (mwIndex)omega[k] - 1 ].im;
                }
                mxFree( output_cplx );
            } else {
                for ( k=0 ; k < nOmega ; k++ ){
                    /*
                    i = (mwIndex) ( (mwIndex)(omega[k]-1) % M);
                    j = (mwIndex) ( (mwIndex)(omega[k]-1)/ M);
                    output[k] = outputBLAS( j*M + i );
                    */
                    /* This now simplifies a lot! */
                    output[k] = outputBLAS[ (mwIndex)omega[k] - 1 ];
                }
                mxFree( outputBLAS );
            }
        }

    } else {

        /* ----------- MODE 2 ------------------------------- */

        if (SPARSE) {
            /* sparse array indices in Matlab are rather confusing;
             * see the mxSetJc help file to get started.  The Ir index
             * is straightforward: it contains rows indices of nonzeros,
             * in column-major order.  But the Jc index is tricky... 
             * Basically, Jc (which has N+1 entries, not nnz entries like Ir)
             * tells you which Ir entries correspond to the jth row, thus fully 
             * specifying the indices.  Ir[ Jc[j]:Jc[J+1] ] are the rows
             * that correspond to column j. This works because Ir is
             * in column-major order.   For this to work (and match A(omega)),
             * we need omega to be sorted!  
             *Note: everything is already 0-based, not 1-based
             * */
            omegaI = mxGetIr( prhs[2] );
            omegaJ = mxGetJc( prhs[2] );
            
            if ( USE_BLAS ) {
                /* We already have the full matrix, so just find the entries */
                if (COMPLEX) {
                    for ( j=0 ; j < N ; j++ ){
                        for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) {
                            i = omegaI[k];
                            output[k] = output_cplx[ j*M + i ].re;
                            output_imag[k] = output_cplx[ j*M + i ].im;
                        }
                    }
                    mxFree( output_cplx );
                } else {
                    for ( j=0 ; j < N ; j++ ){
                        for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) {
                            i = omegaI[k];
                            output[k] = outputBLAS[ j*M + i ];
                        }
                    }
                    mxFree( outputBLAS );
                }

            } else if ((COMPLEX)&&(LARGE_BIT)) {
                /* TRANSPOSE VERSION: this needs to be tested!!! */
                for ( j=0 ; j < N ; j++ ){
                    for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) {
                        i = omegaI[k];

                        temp_cplx.re = 0.0;
                        temp_cplx.im = 0.0;
                        for ( m=0 ; m < K ; m++ ){
                            /*temp_cplx.re += U_cplx[i+m*M].re * V_cplx[j+m*N].re
                                - U_cplx[i+m*M].im * V_cplx[j+m*N].im;
                            temp_cplx.im += U_cplx[i+m*M].im * V_cplx[j+m*N].re
                                + U_cplx[i+m*M].re * V_cplx[j+m*N].im;*/
                            temp_cplx.re += U_cplx[K*i+m].re * V_cplx[K*j+m].re
                                - U_cplx[K*i+m].im * V_cplx[K*j+m].im;
                            temp_cplx.im += U_cplx[K*i+m].im * V_cplx[K*j+m].re
                                + U_cplx[K*i+m].re * V_cplx[K*j+m].im;
                        }
                        output[k] = temp_cplx.re;
                        output_imag[k] = temp_cplx.im;
                    }
                }
            } else if (COMPLEX) {
                /* TRANSPOSE VERSION: UPDATED */
                for ( j=0 ; j < N ; j++ ){
                    for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) {
                        i = omegaI[k];
                        /* temp_cplx = my_zdot((int*) &K, U_cplx+i, (int*)&M, V_cplx+j, (int*)&N ); */
                        temp_cplx = my_zdot((int*) &K, U_cplx+i*K, (int*)&one, V_cplx+j*K, (int*)&one );
                        output[k] = temp_cplx.re;
                        output_imag[k] = temp_cplx.im;
                    }
                }
            } else {
                /* simple case */
                /* TRANSPOSE VERSION: UPDATED */
                for ( j=0 ; j < N ; j++ ){
                    for ( k = omegaJ[j] ; k < omegaJ[j+1] ; k++ ) {
                        i = omegaI[k];
                        /*output[k] = my_ddot( (int*)&K, U+i, (int*)&M, V+j, (int*)&N );*/
                        output[k] = my_ddot( (int*)&K, U+i*K, (int*)&one, V+j*K, (int*)&one );
                    }
                }
            }
        /* ----------- MODE 3 ------------------------------- */
        } else {
            /* we have omegaX and omegaY, the row and column indices */
            nOmega1 = mxGetM( prhs[3] );
            nOmega2 = mxGetN( prhs[3] );
            if ( (nOmega1 != 1) && (nOmega2 != 1) ) {
                printUsage();
                mexErrMsgTxt("OmegaY must be a vector");
            }
            nOmega1 = nOmega1 < nOmega2 ? nOmega2 : nOmega1;
            if ( nOmega1 != nOmega ) {
                printUsage();
    mexErrMsgTxt("In subscript index format, subscript vectors must be same length.");
            }
            omegaX = mxGetPr(prhs[2]);
            omegaY = mxGetPr(prhs[3]);

            if ( USE_BLAS ) {
                /* We already have the full matrix, so just find the entries */
                if (COMPLEX) {
                    for ( k=0 ; k < nOmega ; k++ ){
                        i = omegaX[k] - 1;
                        j = omegaY[k] - 1;
                        output[k] = output_cplx[ j*M + i ].re;
                        output_imag[k] = output_cplx[ j*M + i ].im;
                    }
                    mxFree( output_cplx );
                } else {
                    for ( k=0 ; k < nOmega ; k++ ){
                        i = omegaX[k] - 1;
                        j = omegaY[k] - 1;
                        output[k] = outputBLAS[ j*M + i ];
                    }
                    mxFree( outputBLAS );
                }

            } else if ((COMPLEX)&&(LARGE_BIT)) { 
                /* TRANSPOSE VERSION: this needs to be tested!!! */
                for ( k=0 ; k < nOmega ; k++ ){
                    i = omegaX[k] - 1;
                    j = omegaY[k] - 1;
                    temp_cplx.re = 0.0;
                    temp_cplx.im = 0.0;
                    for ( m=0 ; m < K ; m++ ){
                        /*temp_cplx.re += U_cplx[i+m*M].re * V_cplx[j+m*N].re
                            - U_cplx[i+m*M].im * V_cplx[j+m*N].im;
                        temp_cplx.im += U_cplx[i+m*M].im * V_cplx[j+m*N].re
                            + U_cplx[i+m*M].re * V_cplx[j+m*N].im; */
                        temp_cplx.re += U_cplx[K*i+m].re * V_cplx[K*j+m].re
                            - U_cplx[K*i+m].im * V_cplx[K*j+m].im;
                        temp_cplx.im += U_cplx[K*i+m].im * V_cplx[K*j+m].re
                            + U_cplx[K*i+m].re * V_cplx[K*j+m].im;
                    }
                    output[k] = temp_cplx.re;
                    output_imag[k] = temp_cplx.im;
                }
            } else if (COMPLEX) {
                /* TRANSPOSE VERSION: UPDATED */
                for ( k=0 ; k < nOmega ; k++ ){
                    i = omegaX[k] - 1;
                    j = omegaY[k] - 1;
                    /* temp_cplx = my_zdot( (int*)&K, U_cplx+i, (int*)&M, V_cplx+j, (int*)&N ); */
                    temp_cplx = my_zdot( (int*)&K, U_cplx+i*K, (int*)&one, V_cplx+j*K, (int*)&one );
                    output[k] = temp_cplx.re;
                    output_imag[k] = temp_cplx.im;
                }
            } else {
                /* TRANSPOSE VERSION: UPDATED */
                for ( k=0 ; k < nOmega ; k++ ){
                    i = omegaX[k] - 1;
                    j = omegaY[k] - 1;
                    /* output[k] = my_ddot( (int*)&K, U+i, (int*)&M, V+j, (int*)&N ); */
                    output[k] = my_ddot( (int*)&K, U+i*K, (int*)&one, V+j*K, (int*)&one );
                }
            }

        }
    }

}
Example #5
0
//calculates C = alpha * AB + beta * C
void testsize(int M, int K, int N) {

    double * A = (double*) malloc(sizeof(double) * M * K);
    double * B = (double*) malloc(sizeof(double) * K * N);
    double * C = (double*) malloc(sizeof(double) * M * N);
    double * C2 = (double*) malloc(sizeof(double) * M * N);
    double * C3 = (double*) malloc(sizeof(double) * M * N);


    for(int m = 0; m < M; m++) {
        for(int k = 0; k < K; k++) {
            A[m*K + k] = rand() % 10;
        }
    }
    int i = 0;
    for(int k = 0; k < K; k++) {
        for(int n = 0; n < N; n++) {
            B[k*N + n] = rand() % 10;
        }
    }

    for(int i = 0; i < M *N; i++)
        C[i] = C2[i] = C3[i] = -1.f;


    int times = 0;
    double blastime;
    while(CalcTime(&times,&blastime))
        cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans, M,N,K,1.f,A,K,B,N,0.f,C,N);
    //naive_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans, M,N,K,1.f,A,K,B,N,0.f,C,N);

    //double begin2 = CurrentTimeInSeconds();
    //naive_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans, M,N,K,1.f,A,K,B,N,0.f,C2,N);

    double mytime;
    times = 0;
    while(CalcTime(&times,&mytime))
        my_dgemm(CurrentTimeInSeconds,M,N,K,1.f,A,K,B,N,0.f,C3,N);

    /*
    for(int m = 0; m < M; m++) {
    	for(int n = 0; n < N; n++) {
    		printf("%f ",C[m*N + n]);
    	}
    	printf("\n");
    }
    printf("\n\n");
    for(int m = 0; m < M; m++) {
    	for(int n = 0; n < N; n++) {
    		printf("%f ",C2[m*N + n]);
    	}
    	printf("\n");
    }*/

    //asserteq(C,C2,M,N);
    asserteq(C,C3,M,N);

    free(C);
    free(C2);
    free(C3);
    free(A);
    free(B);
    double logblastime = log(M) + log(N) + log(K) + log(2) + log(1e-9) - log(blastime);
    double logmytime = log(M) + log(N) + log(K) + log(2) + log(1e-9) - log(mytime);
    printf("%d %d %d %f %f %f\n",M,K,N,exp(logblastime),exp(logmytime), mytime/ blastime);
}