void bi::marginalise(const ExpGaussianPdf<V1, M1>& p1, const ExpGaussianPdf<V2,M2>& p2, const M3 C, const ExpGaussianPdf<V4, M4>& q2, ExpGaussianPdf<V5,M5>& p3) { /* pre-conditions */ BI_ASSERT(q2.size() == p2.size()); BI_ASSERT(p3.size() == p1.size()); BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size()); typename sim_temp_vector<V1>::type z2(p2.size()); typename sim_temp_matrix<M1>::type K(p1.size(), p2.size()); typename sim_temp_matrix<M1>::type A1(p2.size(), p2.size()); typename sim_temp_matrix<M1>::type A2(p2.size(), p2.size()); /** * Compute gain matrix: * * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f] */ symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U'); /** * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}', * \Sigma')\f$, where: * * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 + * \mathcal{K}(\boldsymbol{\mu}_3 - \boldsymbol{\mu}_2)\,,\f] */ z2 = q2.mean(); axpy(-1.0, p2.mean(), z2); p3.mean() = p1.mean(); gemv(1.0, K, z2, 1.0, p3.mean()); /** * and: * * \f{eqnarray*} * \Sigma' &=& \Sigma_1 + \mathcal{K}(\Sigma_3 - * \Sigma_2)\mathcal{K}^T \\ * &=& \Sigma_1 + \mathcal{K}\Sigma_3\mathcal{K}^T - * \mathcal{K}\Sigma_2\mathcal{K}^T\,. * \f} */ p3.cov() = p1.cov(); A1 = K; trmm(1.0, q2.std(), A1, 'R', 'U', 'T'); syrk(1.0, A1, 1.0, p3.cov(), 'U'); A2 = K; trmm(1.0, p2.std(), A2, 'R', 'U', 'T'); syrk(-1.0, A2, 1.0, p3.cov(), 'U'); /* make sure correct log-variables set */ p3.setLogs(p2.getLogs()); p3.init(); // redo precalculations }
void test_product_syrk() { for(int i = 0; i < g_repeat ; i++) { int s; s = internal::random<int>(1,320); CALL_SUBTEST_1( syrk(MatrixXf(s, s)) ); s = internal::random<int>(1,320); CALL_SUBTEST_2( syrk(MatrixXd(s, s)) ); s = internal::random<int>(1,200); CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) ); s = internal::random<int>(1,200); CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) ); } }
int main() { double t_start, t_end; DATA_TYPE* A; DATA_TYPE* C; DATA_TYPE* D; A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); D = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); fprintf(stdout, "<< Symmetric rank-k operations >>\n"); init_arrays(A, C, D); syrkGPU(A, D); t_start = rtclock(); syrk(A, C); t_end = rtclock(); fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); compareResults(C, D); free(A); free(C); free(D); return 0; }
// assume that A, which is triangular, is stored in recursive L; that means that the square block is stored in recursive backwards N void chol( double *A, int n ) { // base case if( n <= nmin ) { // probably we want to copy into full, since there doesn't seem to be a blocked packed cholesky in lapack; but the easy version for now int info = 0; //char L = 'L'; //dpotrf_( &L, &size, Afull, &size, &info); //dpptrf_( &L, &n, A, &info); //A[0] = sqrt(A[0]); // this uses the unpacked, but blocked version. double *temp = (double*) malloc( n*n*sizeof(double) ); double *Ap = A; for( int c = 0; c < n; c++ ) for( int r = c; r < n; r++ ) temp[c*n+r] = *(Ap++); char L = 'L', N = 'N'; double none = -1., one = 1.; dpotrf_( &L, &n, temp, &n, &info); Ap = A; for( int c = 0; c < n; c++ ) for( int r = c; r < n; r++ ) *(Ap++) = temp[c*n+r]; free(temp); return; } int nhalf = n/2; double *A11 = A; double *A21 = A+nhalf*(nhalf+1)/2; double *A22 = A21+nhalf*nhalf; chol(A11,nhalf); trsm(A21,A11,nhalf); syrk(A22,A21,nhalf); chol(A22,nhalf); }
void test_product_syrk() { for(int i = 0; i < g_repeat ; i++) { int s; s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE); CALL_SUBTEST_1( syrk(MatrixXf(s, s)) ); CALL_SUBTEST_2( syrk(MatrixXd(s, s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2); CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) ); CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) } }
// One depth-first recursion step of the symmetric rank-k update.
// C is split into C11 | C21 | C22 and A into A11 | A21 | A12 | A22, using the
// block lengths reported by getSizeTri/getSizeSq for level r-1. All six
// sub-updates run in sequence with the same x; the first update of each C
// block is passed the caller's alpha, the second is passed 1.
void syrkDFS( double *C, double *A, int n, int x, int r, double alpha ) {
  const int half   = n/2;
  const int triLen = getSizeTri(r-1,x);
  const int sqLen  = getSizeSq(r-1,x);
  const int next   = r-1;

  // blocks of C
  double *C11 = C;
  double *C21 = C11 + triLen;
  double *C22 = C21 + sqLen;

  // blocks of A, each sqLen long
  double *A11 = A;
  double *A21 = A11 + sqLen;
  double *A12 = A21 + sqLen;
  double *A22 = A12 + sqLen;

  // C11 from A11 and A12
  syrk( C11, A11, half, x, next, alpha );
  syrk( C11, A12, half, x, next, 1. );

  // C21 from (A21,A11) and (A22,A12)
  mult( C21, A21, A11, half, x, next, alpha );
  mult( C21, A22, A12, half, x, next, 1. );

  // C22 from A21 and A22
  syrk( C22, A21, half, x, next, alpha );
  syrk( C22, A22, half, x, next, 1. );
}
void bi::cov(const M1 X, const V1 mu, M2 Sigma) { /* pre-conditions */ BI_ASSERT(X.size2() == mu.size()); BI_ASSERT(Sigma.size1() == mu.size() && Sigma.size2() == mu.size()); const int N = X.size1(); typename sim_temp_matrix<M2>::type Y(X.size1(), X.size2()); Y = X; sub_rows(Y, mu); syrk(1.0/(N - 1.0), Y, 0.0, Sigma, 'U', 'T'); }
int main( int argc, char **argv ) { initCommunication( &argc, &argv ); // make up a simple test int size = read_int( argc, argv, "-s", 8 ); int r = read_int( argc, argv, "-r", 2 ); int P; MPI_Comm_size( MPI_COMM_WORLD, &P ); initSizes( P, r, size ); if( getRank() == 0 ) { if( P > (1<<r) ) printf("Need more recursive steps for this many processors\n"); if( P > (size/(1<<r))*(size/(1<<r)+1)/2) printf("Need a bigger matrix/fewer recursive steps for this many processors\n"); printf("-s %d -r %d -n %d\n", size, r, P); } int sizeSq = getSizeSq(r,P); int sizeTri = getSizeTri(r,P); double *X = (double*) malloc( sizeSq*sizeof(double) ); srand48(getRank()); fill(X,sizeSq); double *A = (double*) malloc( sizeTri*sizeof(double) ); if( getRank() == 0 ) printf("Generating a symmetric positive definite test matrix\n"); initTimers(); MPI_Barrier( MPI_COMM_WORLD ); double st2 = read_timer(); syrk( A, X, size, P, r, 0. ); MPI_Barrier( MPI_COMM_WORLD ); double et2 = read_timer(); if( getRank() == 0 ) printf("Generation time: %f\n", et2-st2); initTimers(); free(X); for( int i = 0; i < sizeTri; i++ ) A[i] = -A[i]; if( getRank() == 0 ) printf("Starting benchmark\n"); MPI_Barrier( MPI_COMM_WORLD ); double startTime = read_timer(); chol( A, size, P, r ); MPI_Barrier( MPI_COMM_WORLD ); double endTime = read_timer(); if( getRank() == 0 ) printf("Time: %f Gflop/s %f\n", endTime-startTime, size*1.*size*size/3./(endTime-startTime)/1.e9); free(A); printCounters(size); MPI_Finalize(); }
/*
 * Benchmark driver: initialises the arrays, runs syrkGPU(), then times the
 * syrk() call, prints it as the CPU runtime and calls compareResults().
 */
int main()
{
	double t_start;
	double t_end;

	init_arrays();
	syrkGPU();

	/* only the syrk() call is inside the timed region */
	t_start = rtclock();
	syrk();
	t_end = rtclock();

	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

	compareResults();

	return 0;
}
// computes C -= A*A^t, where C is symmetric, half stored, A is general void syrk( double *C, double *A, int n ) { // base case if( n <= nmin ) { double *temp = (double*) malloc( n*n*sizeof(double) ); double *Cp = C; for( int c = 0; c < n; c++ ) for( int r = c; r < n; r++ ) temp[c*n+r] = *(Cp++); char L = 'L', N = 'N'; double none = -1., one = 1.; dsyrk_(&L, &N, &n, &n, &none, A, &n, &one, temp, &n); Cp = C; for( int c = 0; c < n; c++ ) for( int r = c; r < n; r++ ) *(Cp++) = temp[c*n+r]; free(temp); //C[0] -= A[0]*A[0]; return; } int nhalf = n/2; double *C11 = C; double *C21 = C + nhalf*(nhalf+1)/2; double *C22 = C21 + nhalf*nhalf; double *A11 = A; double *A21 = A+nhalf*nhalf; double *A12 = A21+nhalf*nhalf; double *A22 = A12+nhalf*nhalf; // these can be made independent with the use of some intermediates, and some final additions syrk( C11, A11, nhalf ); syrk( C11, A12, nhalf ); mult( C21, A21, A11, nhalf ); // This will do C21 = C21-A21*A11^t mult( C21, A22, A12, nhalf ); syrk( C22, A21, nhalf ); syrk( C22, A22, nhalf ); }
void MTLmarks::DmatDmatRun(std::string benchmark) { if(benchmark == "dmatdmatadd"){ mtl_result = dmatdmatadd(size, steps); } else if(benchmark == "dmatdmatmult"){ mtl_result = dmatdmatmult(size, steps); } else if(benchmark == "cmajordmdmmult"){ mtl_result = cmajordmdmmult(size, steps); } else if(benchmark == "rmajordmdmmult"){ mtl_result = rmajordmdmmult(size, steps); } else if(benchmark == "nestedprod"){ mtl_result = nestedprod(size, steps); } else if(benchmark == "symm1"){ mtl_result = symm1(size, steps); } else if(benchmark == "symm1rect"){ mtl_result = symm1rect(size, steps); } else if(benchmark == "symm2"){ mtl_result = symm2(size, steps); } else if(benchmark == "syr2k"){ mtl_result = syr2k(size, steps); } else if(benchmark == "syr2krect"){ mtl_result = syr2krect(size, steps); } else if(benchmark == "syrk"){ mtl_result = syrk(size, steps); } else if(benchmark == "syrkrect"){ mtl_result = syrkrect(size, steps); } else if(benchmark == "custom"){ mtl_result = custom(size, steps); } else{ std::cerr << "MTLmarks benchmark does not exist." << std::endl; exit(1); } }
void bi::condition(const ExpGaussianPdf<V1, M1>& p1, const ExpGaussianPdf<V2, M2>& p2, const M3 C, const V3 x2, ExpGaussianPdf<V4, M4>& p3) { /* pre-condition */ BI_ASSERT(x2.size() == p2.size()); BI_ASSERT(p3.size() == p1.size()); BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size()); typename sim_temp_vector<V1>::type z2(p2.size()); typename sim_temp_matrix<M1>::type K(p1.size(), p2.size()); /** * Compute gain matrix: * * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f] */ symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U'); /** * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}', * \Sigma')\f$, where: * * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 + \mathcal{K}(\mathbf{x}_2 - * \boldsymbol{\mu}_2)\,,\f] */ z2 = x2; log_vector(z2, p2.getLogs()); axpy(-1.0, p2.mean(), z2); p3.mean() = p1.mean(); gemv(1.0, K, z2, 1.0, p3.mean()); /** * and: * * \f{eqnarray*} * \Sigma' &=& \Sigma_1 - \mathcal{K}C_{\mathbf{x}_1,\mathbf{x}_2}^T \\ * &=& \Sigma_1 - C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1} * C_{\mathbf{x}_1,\mathbf{x}_2}^T\,.\f} */ K = C; trsm(1.0, p2.std(), K, 'R', 'U'); p3.cov() = p1.cov(); syrk(-1.0, K, 1.0, p3.cov(), 'U'); /* update log-variables and precalculations */ p3.setLogs(p1.getLogs()); p3.init(); }
void bi::cov(const M1 X, const V1 w, const V2 mu, M2 Sigma) { /* pre-conditions */ BI_ASSERT(X.size2() == mu.size()); BI_ASSERT(X.size1() == w.size()); BI_ASSERT(Sigma.size1() == mu.size() && Sigma.size2() == mu.size()); typedef typename V1::value_type T; typename sim_temp_matrix<M2>::type Y(X.size1(), X.size2()); typename sim_temp_matrix<M2>::type Z(X.size1(), X.size2()); typename sim_temp_vector<V2>::type v(w.size()); T Wt = sum_reduce(w); Y = X; sub_rows(Y, mu); sqrt_elements(w, v); gdmm(1.0, v, Y, 0.0, Z); syrk(1.0/Wt, Z, 0.0, Sigma, 'U', 'T'); // alternative weight: 1.0/(Wt - W2t/Wt) }
int main(int argc, char* argv[]) //int main(void) { double t_start, t_end; DATA_TYPE* A; DATA_TYPE* C; DATA_TYPE* C_outputFromGpu; if(argc==2){ printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]); cpu_offset = atoi(argv[1]); } A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); C_outputFromGpu = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE)); init_arrays(A, C); read_cl_file(); cl_initialization_fusion(); //cl_initialization(); cl_mem_init(A, C); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue[0], c_mem_obj, CL_TRUE, 0, M * N * sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); t_start = rtclock(); syrk(A, C); t_end = rtclock(); fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start); compareResults(C, C_outputFromGpu); cl_clean_up(); free(A); free(C); free(C_outputFromGpu); return 0; }
// should add alpha=0 optimization to this function void syrkWasteX( double *C, double *A, int n, int x, double alpha ) { int nOldSq = getSizeSq( 0, x ); int nOldTri = getSizeTri( 0, x ); int rrank = getRelativeRank(x,1); double *nC, *nA; if( rrank == 0 ) { nC = (double*) malloc( x*nOldTri*sizeof(double) ); nA = (double*) malloc( x*nOldSq*sizeof(double) ); } startTimer(TIMER_COMM_SYRK); int sizesT[x], sizesS[x]; sizesS[0] = nOldSq; sizesT[0] = nOldTri; for( int i = 1; i < x; i++ ) sizesS[i] = 0, sizesT[i] = 0; double *C1[x]; for( int i = 0; i < x; i++ ) C1[i] = C; reduceBy( x, x, C1, nC, sizesT ); double *A1[x]; for( int i = 0; i < x; i++ ) A1[i] = A; reduceBy( x, x, A1, nA, sizesS ); stopTimer(TIMER_COMM_SYRK); if( rrank == 0 ) syrk( nC, nA, n, 1, 0, alpha ); startTimer(TIMER_COMM_SYRK); expandBy( x, x, C1, nC, sizesT ); stopTimer(TIMER_COMM_SYRK); if( rrank == 0 ) { free( nC ); free( nA ); } }
void syrkBFS4( double *C, double *A, int n, int x, int r, double alpha ) { int nhalf = n/2; int xNew = x/4; int rrank = getRelativeRank(x,xNew); int nOldTri = getSizeTri(r-1,x); int nOldSq = getSizeSq(r-1,x); double *C11 = C; double *C21 = C + nOldTri; double *C22 = C21 + nOldSq; double *A11 = A; double *A21 = A+nOldSq; double *A12 = A21+nOldSq; double *A22 = A12+nOldSq; int CSizes[] = {nOldTri,nOldSq,nOldSq,nOldTri}; int ASizes[] = {nOldSq,nOldSq,nOldSq,nOldSq}; double *A1[] = {A11,A21,A22,A21}; double *A2[] = {A12,A11,A12,A22}; double *lA1 = (double*) malloc( 4*nOldSq*sizeof(double) ); double *lA2 = (double*) malloc( 4*nOldSq*sizeof(double) ); double *lC = (double*) malloc( 4*CSizes[rrank]*sizeof(double) ); startTimer(TIMER_COMM_SYRK); reduceBy( 4, x, A1, lA1, ASizes ); reduceBy( 4, x, A2, lA2, ASizes ); stopTimer(TIMER_COMM_SYRK); if( rrank == 0 || rrank == 3 ) { syrk( lC, lA1, nhalf, xNew, r-1, 0. ); syrk( lC, lA2, nhalf, xNew, r-1, 1. ); } else { mult( lC, lA1, lA2, nhalf, xNew, r-1, 0. ); } double *expC11, *expC21, *expC22; if( alpha == 0 ) { expC11 = C11; expC21 = C21; expC22 = C22; } else { expC11 = (double*) malloc( nOldTri*sizeof(double) ); expC21 = (double*) malloc( nOldSq*sizeof(double) ); expC22 = (double*) malloc( nOldTri*sizeof(double) ); } double *cC21 = (double*) malloc( nOldSq*sizeof(double) ); double *C1[] = {expC11,expC21,cC21,expC22}; expandBy( 4, x, C1, lC, CSizes ); int ione = 1; double done = 1.; if( alpha != 0 ) { // actually, this only works for alpha = 1 daxpy_( &nOldTri, &done, expC11, &ione, C11, &ione ); daxpy_( &nOldTri, &done, expC22, &ione, C22, &ione ); daxpy_( &nOldSq, &done, expC21, &ione, C21, &ione ); free(expC11); free(expC22); free(expC21); } daxpy_( &nOldSq, &done, cC21, &ione, C21, &ione ); free(lA1); free(lA2); free(lC); free(cC21); }
T& Linalg<T, H>::syrk( const T &a, T &c, const value_type &alpha, const value_type &beta, Uplo uplo)
{
    // Forward to the overload that takes c as const T&; c was received as
    // non-const, so the constness of the returned reference is removed again.
    const T &cView = c;
    return const_cast< T& >(syrk(a, cView, alpha, beta, uplo));
}
void syrkBFS8( double *C, double *A, int n, int x, int r, double alpha ) { int nhalf = n/2; int xNew = x/4; int xNewer = x/8; int rrank = getRelativeRank(x,xNew); int rrank2 = getRelativeRank(xNew,xNewer); int nOldTri = getSizeTri(r-1,x); int nOldSq = getSizeSq(r-1,x); double *C11 = C; double *C21 = C + nOldTri; double *C22 = C21 + nOldSq; double *A11 = A; double *A21 = A+nOldSq; double *A12 = A21+nOldSq; double *A22 = A12+nOldSq; // first do the 4-way re-arrangement. int nCSize; if( rrank == 0 || rrank == 3 ) nCSize = 4*nOldTri; else nCSize = 4*nOldSq; double *C21c = (double*) malloc( nOldSq*sizeof(double) ); //int Csizes[] = {nOldTri,nOldSq,0,nOldTri}; int Csizes2[] = {nOldTri,nOldSq,nOldSq,nOldTri}; double *nC = (double*) malloc( 4*Csizes2[rrank]*sizeof(double) ); //startTimer(TIMER_COMM_SYRK); //reduceBy( 4, x, C1, nC, Csizes ); //stopTimer(TIMER_COMM_SYRK); double *A1[] = {A11,A21,A22,A22}; double *A2[] = {A12,A11,A12,A21}; int Asizes[] = {nOldSq,nOldSq,nOldSq,nOldSq}; double *nA1 = (double*) malloc( 4*nOldSq*sizeof(double) ); double *nA2 = (double*) malloc( 4*nOldSq*sizeof(double) ); startTimer(TIMER_COMM_SYRK); reduceBy( 4, x, A1, nA1, Asizes ); reduceBy( 4, x, A2, nA2, Asizes ); stopTimer(TIMER_COMM_SYRK); if( rrank == 1 || rrank == 2 ) { // these two do the calls to mult mult( nC, nA1, nA2, nhalf, xNew, r-1, 0. ); } else { // these two will do the recursive syrk calls. First, we need to split them up further double *nCcopy = (double*) malloc( 4*nOldTri*sizeof(double) ); double *nnC = (double*) malloc( 8*nOldTri*sizeof(double) ); double *nnA = (double*) malloc( 8*nOldSq*sizeof(double) ); double *nnA1[] = {nA1,nA2}; int nAsizes[] = {4*nOldSq,4*nOldSq}; startTimer(TIMER_COMM_SYRK); reduceBy( 2, xNew, nnA1, nnA, nAsizes ); stopTimer(TIMER_COMM_SYRK); double *nnC1[] = {nC,nCcopy}; int nCsizes2[] = {4*nOldTri,4*nOldTri}; startTimer(TIMER_COMM_SYRK); //reduceBy( 2, xNew, nnC1, nnC, nCsizes ); stopTimer(TIMER_COMM_SYRK); syrk( nnC, nnA, nhalf, xNewer, r-1, 0. 
); startTimer(TIMER_COMM_SYRK); expandBy( 2, xNew, nnC1, nnC, nCsizes2 ); stopTimer(TIMER_COMM_SYRK); // final additions int ione = 1; double done = 1.; int s = 4*nOldTri; daxpy_( &s, &done, nCcopy, &ione, nC, &ione ); free(nCcopy); free(nnC); free(nnA); } free(nA1); free(nA2); // recollect the answers, final additions double *expC11, *expC21, *expC22; if( alpha == 0. ) { expC11 = C11; expC21 = C21; expC22 = C22; } else { expC11 = (double*) malloc( nOldTri*sizeof(double) ); expC21 = (double*) malloc( nOldSq*sizeof(double) ); expC22 = (double*) malloc( nOldTri*sizeof(double) ); } double *C1[] = {expC11, expC21, C21c, expC22}; startTimer(TIMER_COMM_SYRK); expandBy( 4, x, C1, nC, Csizes2 ); stopTimer(TIMER_COMM_SYRK); free(nC); int ione = 1; double done = 1.; if( alpha != 0 ) { // only correct for alpha=1 daxpy_( &nOldTri, &done, expC11, &ione, C11, &ione ); daxpy_( &nOldSq, &done, expC21, &ione, C21, &ione ); daxpy_( &nOldTri, &done, expC22, &ione, C22, &ione ); free(expC11); free(expC21); free(expC22); } daxpy_( &nOldSq, &done, C21c, &ione, C21, &ione ); free(C21c); }
T Linalg<T, H>::syrk( const T &a, const value_type &alpha, const value_type &beta, Uplo uplo)
{
    // Build the output with a's allocator, let the pointer-taking overload
    // fill it, and return it by value.
    T result(a.allocator());
    syrk(a, &result, alpha, beta, uplo);
    return result;
}
void syrkBFS2( double *C, double *A, int n, int x, int r, double alpha ) { int nhalf = n/2; int xNew = x/2; int rrank = getRelativeRank(x,xNew); int nOldTri = getSizeTri(r-1,x); int nOldSq = getSizeSq(r-1,x); double *C11 = C; double *C21 = C + nOldTri; double *C22 = C21 + nOldSq; double *A11 = A; double *A21 = A+nOldSq; double *A12 = A21+nOldSq; double *A22 = A12+nOldSq; double *C21c = (double*) malloc( nOldSq*sizeof(double) ); double *nC1 = (double*) malloc( 2*nOldTri*sizeof(double) ); double *nC2 = (double*) malloc( 2*nOldSq*sizeof(double) ); int C1sizes[] = {nOldTri,nOldTri}; int C2sizes[] = {nOldSq,0}; int C2sizes2[] = {nOldSq,nOldSq}; double *A1[] = {A11,A22}; double *A2[] = {A12,A21}; double *A3[] = {A21,A12}; double *nA1 = (double*) malloc( 2*nOldSq*sizeof(double) ); double *nA2 = (double*) malloc( 2*nOldSq*sizeof(double) ); double *nA3 = (double*) malloc( 2*nOldSq*sizeof(double) ); startTimer(TIMER_COMM_SYRK); MPI_Request *req; double *buf; reduceBy( 2, x, A1, nA1, C2sizes2 ); iReduceBy1( 2, x, A2, nA2, C2sizes2, req, buf ); stopTimer(TIMER_COMM_SYRK); syrk( nC1, nA1, nhalf, xNew, r-1, 0. ); startTimer(TIMER_COMM_SYRK); iReduceBy2( 2, x, A2, nA2, C2sizes2, req, buf ); iReduceBy1( 2, x, A3, nA3, C2sizes2, req, buf ); stopTimer(TIMER_COMM_SYRK); syrk( nC1, nA2, nhalf, xNew, r-1, 1. ); double *expC11, *expC21, *expC22; if( alpha == 0 ) { expC11 = C11; expC21 = C21; expC22 = C22; } else { expC11 = (double*) malloc( nOldTri*sizeof(double) ); expC21 = (double*) malloc( nOldSq*sizeof(double) ); expC22 = (double*) malloc( nOldTri*sizeof(double) ); } double *C1[] = {expC11,expC22}; double *C2[] = {expC21,C21c}; startTimer(TIMER_COMM_SYRK); iReduceBy2( 2, x, A3, nA3, C2sizes2, req, buf ); iExpandBy1( 2, x, C1, nC1, C1sizes, req, buf ); stopTimer(TIMER_COMM_SYRK); if( rrank == 0 ) mult( nC2, nA3, nA1, nhalf, xNew, r-1, 0. ); else mult( nC2, nA1, nA3, nhalf, xNew, r-1, 0. 
); startTimer(TIMER_COMM_SYRK); iExpandBy2( 2, x, C1, nC1, C1sizes, req, buf ); iExpandBy1( 2, x, C2, nC2, C2sizes2, req, buf ); stopTimer(TIMER_COMM_SYRK); int ione = 1; double done = 1.; if( alpha != 0 ) { // actually, this only works for alpha = 1 daxpy_( &nOldTri, &done, expC11, &ione, C11, &ione ); daxpy_( &nOldTri, &done, expC22, &ione, C22, &ione ); free(expC11); free(expC22); } startTimer(TIMER_COMM_SYRK); iExpandBy2( 2, x, C2, nC2, C2sizes2, req, buf ); stopTimer(TIMER_COMM_SYRK); if( alpha != 0 ) { // actually, this only works for alpha = 1 daxpy_( &nOldSq, &done, expC21, &ione, C21, &ione ); free(expC21); } daxpy_( &nOldSq, &done, C21c, &ione, C21, &ione ); free(C21c); free(nC1); free(nC2); free(nA1); free(nA2); free(nA3); }