コード例 #1
0
ファイル: misc.hpp プロジェクト: milthorpe/LibBi
/**
 * Update the Gaussian @p p1 given that the distribution over a second set of
 * variables has changed from @p p2 to @p q2, writing the result to @p p3.
 *
 * @param p1 Distribution over the first set of variables.
 * @param p2 Original distribution over the second set of variables.
 * @param C Cross-covariance block; asserted to be p1.size() x p2.size().
 * @param q2 Revised distribution over the second set; asserted to have the
 * same size as @p p2.
 * @param[out] p3 Result; asserted to have the same size as @p p1.
 */
void bi::marginalise(const ExpGaussianPdf<V1, M1>& p1,
    const ExpGaussianPdf<V2,M2>& p2, const M3 C,
    const ExpGaussianPdf<V4, M4>& q2, ExpGaussianPdf<V5,M5>& p3) {
  /* pre-conditions */
  BI_ASSERT(q2.size() == p2.size());
  BI_ASSERT(p3.size() == p1.size());
  BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size());

  /* A1 and A2 receive copies of K, and the result of syrk() on them is
   * accumulated into the p1.size() x p1.size() covariance of p3, so they
   * must have the same p1.size() x p2.size() shape as K (this only
   * coincides with p2.size() x p2.size() when both sizes are equal) */
  typename sim_temp_vector<V1>::type z2(p2.size());
  typename sim_temp_matrix<M1>::type K(p1.size(), p2.size());
  typename sim_temp_matrix<M1>::type A1(p1.size(), p2.size());
  typename sim_temp_matrix<M1>::type A2(p1.size(), p2.size());

  /**
   * Compute gain matrix:
   *
   * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f]
   */
  symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U');

  /**
   * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}',
   * \Sigma')\f$, where (with \f$\boldsymbol{\mu}_3\f$ and \f$\Sigma_3\f$
   * taken from @p q2):
   *
   * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 +
   * \mathcal{K}(\boldsymbol{\mu}_3 - \boldsymbol{\mu}_2)\,,\f]
   */
  z2 = q2.mean();
  axpy(-1.0, p2.mean(), z2);
  p3.mean() = p1.mean();
  gemv(1.0, K, z2, 1.0, p3.mean());

  /**
   * and:
   *
   * \f{eqnarray*}
   * \Sigma' &=& \Sigma_1 + \mathcal{K}(\Sigma_3 -
   * \Sigma_2)\mathcal{K}^T \\
   * &=& \Sigma_1 + \mathcal{K}\Sigma_3\mathcal{K}^T -
   * \mathcal{K}\Sigma_2\mathcal{K}^T\,.
   * \f}
   */
  p3.cov() = p1.cov();

  /* add the q2 term: scale a copy of K by the factor q2.std(), then
   * accumulate its symmetric product into the upper triangle */
  A1 = K;
  trmm(1.0, q2.std(), A1, 'R', 'U', 'T');
  syrk(1.0, A1, 1.0, p3.cov(), 'U');

  /* subtract the p2 term in the same way */
  A2 = K;
  trmm(1.0, p2.std(), A2, 'R', 'U', 'T');
  syrk(-1.0, A2, 1.0, p3.cov(), 'U');

  /* make sure correct log-variables set */
  /* NOTE(review): p3 is sized like p1, yet the log-variable set is taken
   * from p2 here, whereas condition() copies it from p1 -- confirm which
   * is intended */
  p3.setLogs(p2.getLogs());
  p3.init(); // redo precalculations
}
コード例 #2
0
ファイル: product_syrk.cpp プロジェクト: 151706061/ParaView
// Runs the syrk check g_repeat times. A fresh random square size is drawn
// before every subtest: at most 320 for the two real-valued matrix types and
// at most 200 for the two complex ones.
void test_product_syrk()
{
  int pass = 0;
  while(pass < g_repeat)
  {
    // The size variable keeps the name `s` so that the text handed to the
    // subtest macros is unchanged.
    int s = internal::random<int>(1,320);
    CALL_SUBTEST_1( syrk(MatrixXf(s, s)) );

    s = internal::random<int>(1,320);
    CALL_SUBTEST_2( syrk(MatrixXd(s, s)) );

    s = internal::random<int>(1,200);
    CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) );

    s = internal::random<int>(1,200);
    CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) );

    ++pass;
  }
}
コード例 #3
0
ファイル: syrk.c プロジェクト: rcfsousa/Polybench_OpenMP
int main() {
  double t_start, t_end;

  DATA_TYPE* A;
  DATA_TYPE* C;
  DATA_TYPE* D;

  A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
  C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
  D = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));

  fprintf(stdout, "<< Symmetric rank-k operations >>\n");

  init_arrays(A, C, D);	
  syrkGPU(A, D);

  t_start = rtclock();
  syrk(A, C);
  t_end = rtclock();
  fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);

  compareResults(C, D);

  free(A);
  free(C);
  free(D);
  return 0;
}
コード例 #4
0
ファイル: sequential.cpp プロジェクト: benjamingr/CAPS
// Recursive Cholesky factorisation, in place.
//
// assume that A, which is triangular, is stored in recursive L; that means
// that the square block is stored in recursive backwards N
//
// A: packed triangular matrix of order n, overwritten with the result.
// n: matrix order; blocks of order <= nmin are handed to LAPACK, larger ones
//    are split into halves of order n/2.
void chol( double *A, int n ) {
  // base case
  if( n <= nmin ) {
    // There doesn't seem to be a blocked packed Cholesky in LAPACK, so the
    // packed lower triangle is copied into a full n x n buffer and the
    // unpacked, blocked routine is used on that.
    // NOTE(review): info is never inspected, so a failed factorisation goes
    // unnoticed; the malloc result is not checked either.
    int info = 0;
    double *temp = (double*) malloc( n*n*sizeof(double) );
    double *Ap = A;
    // unpack: only entries with r >= c of each column c are written
    for( int c = 0; c < n; c++ )
      for( int r = c; r < n; r++ )
        temp[c*n+r] = *(Ap++);
    char L = 'L';
    dpotrf_( &L, &n, temp, &n, &info);
    // pack the same entries back into A
    Ap = A;
    for( int c = 0; c < n; c++ )
      for( int r = c; r < n; r++ )
        *(Ap++) = temp[c*n+r];
    free(temp);
    return;
  }
  // recursive case; layout is [A11 triangle | A21 square | A22 triangle]
  int nhalf = n/2;
  double *A11 = A;
  double *A21 = A+nhalf*(nhalf+1)/2;
  double *A22 = A21+nhalf*nhalf;
  chol(A11,nhalf);
  trsm(A21,A11,nhalf);
  syrk(A22,A21,nhalf);
  chol(A22,nhalf);
}
コード例 #5
0
ファイル: product_syrk.cpp プロジェクト: 13221325403/openbr
// Runs the syrk check g_repeat times. Each pass draws one random size (up to
// EIGEN_TEST_MAX_SIZE) shared by the two real-valued subtests and a second
// one (up to half of that) shared by the two complex subtests.
void test_product_syrk()
{
  int pass = 0;
  while(pass < g_repeat)
  {
    // real-valued matrices
    int s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE);
    CALL_SUBTEST_1( syrk(MatrixXf(s, s)) );
    CALL_SUBTEST_2( syrk(MatrixXd(s, s)) );
    TEST_SET_BUT_UNUSED_VARIABLE(s)

    // complex-valued matrices
    s = internal::random<int>(1,EIGEN_TEST_MAX_SIZE/2);
    CALL_SUBTEST_3( syrk(MatrixXcf(s, s)) );
    CALL_SUBTEST_4( syrk(MatrixXcd(s, s)) );
    TEST_SET_BUT_UNUSED_VARIABLE(s)

    ++pass;
  }
}
コード例 #6
0
ファイル: syrk.cpp プロジェクト: benjamingr/CAPS
// Depth-first step of the recursive syrk: all six half-size sub-problems are
// issued one after another with the same process count x and recursion
// depth r-1.
//
// C is addressed as [C11 | C21 | C22] and A as [A11 | A21 | A12 | A22], with
// block lengths taken from getSizeTri/getSizeSq at depth r-1. For each output
// block the first call receives the caller's alpha and the second receives 1.
void syrkDFS( double *C, double *A, int n, int x, int r, double alpha ) {
  const int half = n/2;
  const int triLen = getSizeTri(r-1,x);
  const int sqLen = getSizeSq(r-1,x);

  // output blocks
  double *const outTop = C;
  double *const outOff = outTop + triLen;
  double *const outBot = outOff + sqLen;

  // input blocks, four consecutive runs of sqLen
  double *const in11 = A;
  double *const in21 = in11 + sqLen;
  double *const in12 = in21 + sqLen;
  double *const in22 = in12 + sqLen;

  // C11 from A11 and A12
  syrk( outTop, in11, half, x, r-1, alpha );
  syrk( outTop, in12, half, x, r-1, 1. );
  // C21 from A21*A11 and A22*A12
  mult( outOff, in21, in11, half, x, r-1, alpha );
  mult( outOff, in22, in12, half, x, r-1, 1. );
  // C22 from A21 and A22
  syrk( outBot, in21, half, x, r-1, alpha );
  syrk( outBot, in22, half, x, r-1, 1. );
}
コード例 #7
0
ファイル: misc.hpp プロジェクト: milthorpe/LibBi
void bi::cov(const M1 X, const V1 mu, M2 Sigma) {
  /* pre-conditions */
  BI_ASSERT(X.size2() == mu.size());
  BI_ASSERT(Sigma.size1() == mu.size() && Sigma.size2() == mu.size());

  const int N = X.size1();
  typename sim_temp_matrix<M2>::type Y(X.size1(), X.size2());
  Y = X;
  sub_rows(Y, mu);
  syrk(1.0/(N - 1.0), Y, 0.0, Sigma, 'U', 'T');
}
コード例 #8
0
ファイル: bench-chol.cpp プロジェクト: benjamingr/CAPS
int main( int argc, char **argv ) {

  initCommunication( &argc, &argv );
  
  // make up a simple test
  int size = read_int( argc, argv, "-s", 8 );
  int r = read_int( argc, argv, "-r", 2 );
  int P;
  MPI_Comm_size( MPI_COMM_WORLD, &P );
  initSizes( P, r, size );
  if( getRank() == 0 ) {
    if( P > (1<<r) )
      printf("Need more recursive steps for this many processors\n");
    if( P > (size/(1<<r))*(size/(1<<r)+1)/2)
      printf("Need a bigger matrix/fewer recursive steps for this many processors\n");
    printf("-s %d -r %d -n %d\n", size, r, P);
  }
  int sizeSq = getSizeSq(r,P);
  int sizeTri = getSizeTri(r,P);
  double *X = (double*) malloc( sizeSq*sizeof(double) );
  srand48(getRank());
  fill(X,sizeSq);
  double *A = (double*) malloc( sizeTri*sizeof(double) );
  if( getRank() == 0 )
    printf("Generating a symmetric positive definite test matrix\n");
  initTimers();
  MPI_Barrier( MPI_COMM_WORLD );
  double st2 = read_timer();
  syrk( A, X, size, P, r, 0. );
  MPI_Barrier( MPI_COMM_WORLD );
  double et2 = read_timer();
  if( getRank() == 0 )
    printf("Generation time: %f\n", et2-st2);
  initTimers();
  free(X);
  for( int i = 0; i < sizeTri; i++ )
    A[i] = -A[i];

  if( getRank() == 0 )
    printf("Starting benchmark\n");
  MPI_Barrier( MPI_COMM_WORLD );
  double startTime = read_timer();
  chol( A, size, P, r );
  MPI_Barrier( MPI_COMM_WORLD );
  double endTime = read_timer();
  
  if( getRank() == 0 )
    printf("Time: %f Gflop/s %f\n", endTime-startTime, size*1.*size*size/3./(endTime-startTime)/1.e9);

  free(A);
  printCounters(size);
  MPI_Finalize();
}
コード例 #9
0
ファイル: syrk_m.c プロジェクト: rcfsousa/Polybench_OpenMP
/*
 * Benchmark driver: initialises the arrays, runs syrkGPU(), times syrk()
 * and finally calls compareResults(). Always returns 0.
 */
int main() {
  init_arrays();
  syrkGPU();

  /* only the CPU version is timed */
  const double begin = rtclock();
  syrk();
  const double finish = rtclock();
  fprintf(stdout, "CPU Runtime: %0.6lfs\n", finish - begin);

  compareResults();
  return 0;
}
コード例 #10
0
ファイル: sequential.cpp プロジェクト: benjamingr/CAPS
// Sequential recursive symmetric rank-k update: C -= A*A^t.
//
// C: symmetric, only one half stored (packed); updated in place.
// A: general square matrix of order n.
// n: matrix order. Orders above nmin are split into halves of order n/2;
//    smaller ones are handled by a single BLAS dsyrk call.
void syrk( double *C, double *A, int n ) {
  if( n > nmin ) {
    const int half = n/2;
    const int sq = half*half;
    // C is laid out as [C11 triangle | C21 square | C22 triangle]
    double *C11 = C;
    double *C21 = C + half*(half+1)/2;
    double *C22 = C21 + sq;
    // A is laid out as four consecutive squares: A11, A21, A12, A22
    double *A11 = A;
    double *A21 = A11 + sq;
    double *A12 = A21 + sq;
    double *A22 = A12 + sq;

    // these can be made independent with the use of some intermediates, and
    // some final additions
    syrk( C11, A11, half );
    syrk( C11, A12, half );
    mult( C21, A21, A11, half ); // C21 = C21 - A21*A11^t
    mult( C21, A22, A12, half );
    syrk( C22, A21, half );
    syrk( C22, A22, half );
    return;
  }

  // base case: expand the packed half into a full n x n scratch buffer,
  // update it with dsyrk, and pack the same entries back
  double *full = (double*) malloc( n*n*sizeof(double) );
  int k = 0;
  for( int col = 0; col < n; col++ )
    for( int row = col; row < n; row++ )
      full[col*n+row] = C[k++];

  char uplo = 'L', trans = 'N';
  double minusOne = -1., plusOne = 1.;
  dsyrk_(&uplo, &trans, &n, &n, &minusOne, A, &n, &plusOne, full, &n);

  k = 0;
  for( int col = 0; col < n; col++ )
    for( int row = col; row < n; row++ )
      C[k++] = full[col*n+row];
  free(full);
}
コード例 #11
0
ファイル: mtlmarks.hpp プロジェクト: BoostGSoC14/boost.ublas
// Runs the dense-matrix/dense-matrix benchmark selected by name and stores
// its result in mtl_result. An unknown name prints an error to std::cerr and
// terminates the process with exit status 1.
void MTLmarks::DmatDmatRun(std::string benchmark) {

    // elementwise and plain products
    if(benchmark == "dmatdmatadd")   { mtl_result = dmatdmatadd(size, steps);    return; }
    if(benchmark == "dmatdmatmult")  { mtl_result = dmatdmatmult(size, steps);   return; }
    if(benchmark == "cmajordmdmmult"){ mtl_result = cmajordmdmmult(size, steps); return; }
    if(benchmark == "rmajordmdmmult"){ mtl_result = rmajordmdmmult(size, steps); return; }
    if(benchmark == "nestedprod")    { mtl_result = nestedprod(size, steps);     return; }

    // symmetric kernels
    if(benchmark == "symm1")         { mtl_result = symm1(size, steps);          return; }
    if(benchmark == "symm1rect")     { mtl_result = symm1rect(size, steps);      return; }
    if(benchmark == "symm2")         { mtl_result = symm2(size, steps);          return; }
    if(benchmark == "syr2k")         { mtl_result = syr2k(size, steps);          return; }
    if(benchmark == "syr2krect")     { mtl_result = syr2krect(size, steps);      return; }
    if(benchmark == "syrk")          { mtl_result = syrk(size, steps);           return; }
    if(benchmark == "syrkrect")      { mtl_result = syrkrect(size, steps);       return; }

    if(benchmark == "custom")        { mtl_result = custom(size, steps);         return; }

    // no benchmark matched
    std::cerr << "MTLmarks benchmark does not exist." << std::endl;
    exit(1);
}
コード例 #12
0
ファイル: misc.hpp プロジェクト: milthorpe/LibBi
void bi::condition(const ExpGaussianPdf<V1, M1>& p1, const ExpGaussianPdf<V2,
    M2>& p2, const M3 C, const V3 x2, ExpGaussianPdf<V4, M4>& p3) {
  /* pre-condition */
  BI_ASSERT(x2.size() == p2.size());
  BI_ASSERT(p3.size() == p1.size());
  BI_ASSERT(C.size1() == p1.size() && C.size2() == p2.size());

  typename sim_temp_vector<V1>::type z2(p2.size());
  typename sim_temp_matrix<M1>::type K(p1.size(), p2.size());

  /**
   * Compute gain matrix:
   *
   * \f[\mathcal{K} = C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}\,.\f]
   */
  symm(1.0, p2.prec(), C, 0.0, K, 'R', 'U');

  /**
   * Then result is given by \f$\mathcal{N}(\boldsymbol{\mu}',
   * \Sigma')\f$, where:
   *
   * \f[\boldsymbol{\mu}' = \boldsymbol{\mu}_1 + \mathcal{K}(\mathbf{x}_2 -
   * \boldsymbol{\mu}_2)\,,\f]
   */
  z2 = x2;
  log_vector(z2, p2.getLogs());
  axpy(-1.0, p2.mean(), z2);
  p3.mean() = p1.mean();
  gemv(1.0, K, z2, 1.0, p3.mean());

  /**
   * and:
   *
   * \f{eqnarray*}
   * \Sigma' &=& \Sigma_1 - \mathcal{K}C_{\mathbf{x}_1,\mathbf{x}_2}^T \\
   * &=& \Sigma_1 - C_{\mathbf{x}_1,\mathbf{x}_2}\Sigma_2^{-1}
   * C_{\mathbf{x}_1,\mathbf{x}_2}^T\,.\f}
   */
  K = C;
  trsm(1.0, p2.std(), K, 'R', 'U');
  p3.cov() = p1.cov();
  syrk(-1.0, K, 1.0, p3.cov(), 'U');

  /* update log-variables and precalculations */
  p3.setLogs(p1.getLogs());
  p3.init();
}
コード例 #13
0
ファイル: misc.hpp プロジェクト: milthorpe/LibBi
void bi::cov(const M1 X, const V1 w, const V2 mu, M2 Sigma) {
  /* pre-conditions */
  BI_ASSERT(X.size2() == mu.size());
  BI_ASSERT(X.size1() == w.size());
  BI_ASSERT(Sigma.size1() == mu.size() && Sigma.size2() == mu.size());

  typedef typename V1::value_type T;
  typename sim_temp_matrix<M2>::type Y(X.size1(), X.size2());
  typename sim_temp_matrix<M2>::type Z(X.size1(), X.size2());
  typename sim_temp_vector<V2>::type v(w.size());

  T Wt = sum_reduce(w);
  Y = X;
  sub_rows(Y, mu);
  sqrt_elements(w, v);
  gdmm(1.0, v, Y, 0.0, Z);
  syrk(1.0/Wt, Z, 0.0, Sigma, 'U', 'T');
  // alternative weight: 1.0/(Wt - W2t/Wt)
}
コード例 #14
0
ファイル: syrk.c プロジェクト: zhangfengthu/CoRunBench
/*
 * Benchmark driver: runs the OpenCL syrk kernel, reads its output back,
 * times the CPU syrk() and hands both results to compareResults().
 *
 * When exactly one command-line argument is given it is parsed with atoi()
 * and stored in cpu_offset.
 *
 * Returns 0 on completion, or 1 if a host buffer cannot be allocated.
 */
int main(int argc, char* argv[])
{
	double t_start, t_end;

	DATA_TYPE* A;
	DATA_TYPE* C;
	DATA_TYPE* C_outputFromGpu;

	if(argc==2){
		printf("arg 1 = %s\narg 2 = %s\n", argv[0], argv[1]);
		cpu_offset = atoi(argv[1]);
	}

	A = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
	C = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));
	C_outputFromGpu = (DATA_TYPE*)malloc(N*M*sizeof(DATA_TYPE));

	/* malloc returns NULL on failure; stop before any buffer is used */
	if(A == NULL || C == NULL || C_outputFromGpu == NULL){
		fprintf(stderr, "Error: failed to allocate host buffers\n");
		free(A);
		free(C);
		free(C_outputFromGpu);
		return 1;
	}

	init_arrays(A, C);
	read_cl_file();
	cl_initialization_fusion();
	/* cl_initialization() is the alternative set-up routine, left disabled */
	cl_mem_init(A, C);
	cl_load_prog();

	cl_launch_kernel();

	/* blocking read (CL_TRUE) of the device result into the host buffer;
	 * a failure is reported but execution continues */
	errcode = clEnqueueReadBuffer(clCommandQue[0], c_mem_obj, CL_TRUE, 0, M * N * sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL);
	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");

	/* only the CPU version is timed here */
	t_start = rtclock();
	syrk(A, C);
	t_end = rtclock();
	fprintf(stdout, "CPU Runtime: %0.6lfs\n", t_end - t_start);
	compareResults(C, C_outputFromGpu);
	cl_clean_up();

	free(A);
	free(C);
	free(C_outputFromGpu);

	return 0;
}
コード例 #15
0
ファイル: syrk.cpp プロジェクト: benjamingr/CAPS
// Runs syrk on a single process out of x: the data is passed through
// reduceBy, the process with relative rank 0 calls syrk with a process count
// of 1 and recursion depth 0, and the result is passed back through expandBy.
//
// C, A:  local parts of the output and input.
// n:     order passed on to syrk.
// x:     current process count.
// alpha: passed on to syrk unchanged.
//
//  should add alpha=0 optimization to this function
void syrkWasteX( double *C, double *A, int n, int x, double alpha ) {
  int nOldSq = getSizeSq( 0, x );
  int nOldTri = getSizeTri( 0, x );
  int rrank = getRelativeRank(x,1);

  // Only relative rank 0 owns gather buffers. The other ranks keep null
  // pointers, so the values passed to reduceBy/expandBy below are always
  // initialised and the unconditional intent of the later free is explicit.
  double *nC = NULL;
  double *nA = NULL;
  if( rrank == 0 ) {
    nC = (double*) malloc( x*nOldTri*sizeof(double) );
    nA = (double*) malloc( x*nOldSq*sizeof(double) );
  }

  startTimer(TIMER_COMM_SYRK);
  // Per-slot sizes: everything goes to slot 0, the other slots are empty.
  // (Arrays sized by x rely on the compiler's variable-length-array
  // extension.)
  int sizesT[x], sizesS[x];
  sizesS[0] = nOldSq;
  sizesT[0] = nOldTri;
  for( int i = 1; i < x; i++ )
    sizesS[i] = 0, sizesT[i] = 0;

  // every slot points at the same local block
  double *C1[x];
  for( int i = 0; i < x; i++ )
    C1[i] = C;
  reduceBy( x, x, C1, nC, sizesT );

  double *A1[x];
  for( int i = 0; i < x; i++ )
    A1[i] = A;
  reduceBy( x, x, A1, nA, sizesS );

  stopTimer(TIMER_COMM_SYRK);

  if( rrank == 0 )
    syrk( nC, nA, n, 1, 0, alpha );

  // hand the result back to the original owners
  startTimer(TIMER_COMM_SYRK);
  expandBy( x, x, C1, nC, sizesT );
  stopTimer(TIMER_COMM_SYRK);

  if( rrank == 0 ) {
    free( nC );
    free( nA );
  }
}
コード例 #16
0
ファイル: syrk.cpp プロジェクト: benjamingr/CAPS
// Breadth-first step of the distributed syrk, four-way variant.
//
// The half-size sub-problems are handed out according to the relative rank
// returned by getRelativeRank(x, x/4): ranks 0 and 3 perform two syrk calls,
// ranks 1 and 2 perform one mult call, each recursing with x/4 as the process
// count and r-1 as the depth. The partial results are then brought back with
// expandBy and combined into C.
//
// C, A:  local parts of the output and input; C is addressed as
//        [C11 | C21 | C22] and A as [A11 | A21 | A12 | A22].
// n:     current order (the recursion uses n/2).
// x:     current process count.
// r:     current recursion depth.
// alpha: 0 writes the result straight into C; any other value adds the
//        result to C with coefficient 1 (see the note near the daxpy calls).
void syrkBFS4( double *C, double *A, int n, int x, int r, double alpha ) {
  int nhalf = n/2;
  int xNew = x/4;
  int rrank = getRelativeRank(x,xNew);

  // local block lengths at the next depth
  int nOldTri = getSizeTri(r-1,x);
  int nOldSq = getSizeSq(r-1,x);
  double *C11 = C;
  double *C21 = C + nOldTri;
  double *C22 = C21 + nOldSq;
  double *A11 = A;
  double *A21 = A+nOldSq;
  double *A12 = A21+nOldSq;
  double *A22 = A12+nOldSq;

  // Slot i of each table describes one of the four sub-problems:
  // presumably slot i is the one delivered to relative rank i -- TODO confirm
  // against reduceBy/expandBy.
  int CSizes[] = {nOldTri,nOldSq,nOldSq,nOldTri};
  int ASizes[] = {nOldSq,nOldSq,nOldSq,nOldSq};
  double *A1[] = {A11,A21,A22,A21};
  double *A2[] = {A12,A11,A12,A22};
  double *lA1 = (double*) malloc( 4*nOldSq*sizeof(double) );
  double *lA2 = (double*) malloc( 4*nOldSq*sizeof(double) );
  // the result buffer is sized for this rank's own slot
  double *lC = (double*) malloc( 4*CSizes[rrank]*sizeof(double) );

  startTimer(TIMER_COMM_SYRK);
  reduceBy( 4, x, A1, lA1, ASizes );
  reduceBy( 4, x, A2, lA2, ASizes );
  stopTimer(TIMER_COMM_SYRK);

  // The first call of each branch passes 0 as the last argument and the
  // second syrk passes 1, so lC is never read before it has been written
  // (assuming the last argument plays the same role as alpha here).
  if( rrank == 0 || rrank == 3 ) {
    syrk( lC, lA1, nhalf, xNew, r-1, 0. );
    syrk( lC, lA2, nhalf, xNew, r-1, 1. );
  } else {
    mult( lC, lA1, lA2, nhalf, xNew, r-1, 0. );
  }

  // Destination of the expanded results: C itself when it may be
  // overwritten, temporaries otherwise.
  double *expC11, *expC21, *expC22;
  if( alpha == 0 ) {
    expC11 = C11;
    expC21 = C21;
    expC22 = C22;
  } else {
    expC11 = (double*) malloc( nOldTri*sizeof(double) );
    expC21 = (double*) malloc( nOldSq*sizeof(double) );
    expC22 = (double*) malloc( nOldTri*sizeof(double) );
  }
  // slot 2 always lands in a scratch block because it is a second
  // contribution to C21
  double *cC21 = (double*) malloc( nOldSq*sizeof(double) );
  double *C1[] = {expC11,expC21,cC21,expC22};

  expandBy( 4, x, C1, lC, CSizes );

  int ione = 1;
  double done = 1.;
  if( alpha != 0 ) { // actually, this only works for alpha = 1
    // C += temporaries, with coefficient 1 regardless of alpha
    daxpy_( &nOldTri, &done, expC11, &ione, C11, &ione );
    daxpy_( &nOldTri, &done, expC22, &ione, C22, &ione );
    daxpy_( &nOldSq, &done, expC21, &ione, C21, &ione );
    free(expC11);
    free(expC22);
    free(expC21);
  }

  // add the second C21 contribution in every case
  daxpy_( &nOldSq, &done, cC21, &ione, C21, &ione );
  free(lA1);
  free(lA2);
  free(lC);
  free(cC21);
}
コード例 #17
0
// Overload for a mutable output: forwards to the syrk overload that is
// selected for a const-qualified `c` and strips the const from what that
// overload returns.
T& Linalg<T, H>::syrk(
    const T &a, T &c, const value_type &alpha, const value_type &beta,
    Uplo uplo) {
  // a const view of the output picks the other overload instead of this one
  const T &c_readonly = c;
  return const_cast< T& >(syrk(a, c_readonly, alpha, beta, uplo));
}
コード例 #18
0
ファイル: syrk.cpp プロジェクト: benjamingr/CAPS
// Breadth-first step of the distributed syrk, eight-way variant.
//
// Stage 1 mirrors the four-way step: blocks of A are passed through reduceBy
// with a fan-out of 4 and the relative rank from getRelativeRank(x, x/4)
// decides the work. Ranks 1 and 2 call mult directly. Ranks 0 and 3 perform
// stage 2: their two A blocks go through a further reduceBy with a fan-out of
// 2, a single syrk runs with x/8 as the process count, and the result is
// expanded back and summed. Finally the stage-1 results are expanded and
// combined into C.
//
// C, A:  local parts of the output and input; C is addressed as
//        [C11 | C21 | C22] and A as [A11 | A21 | A12 | A22].
// n:     current order (the recursion uses n/2).
// x:     current process count.
// r:     current recursion depth.
// alpha: 0 writes the result straight into C; any other value adds the
//        result to C with coefficient 1 (see the note near the daxpy calls).
void syrkBFS8( double *C, double *A, int n, int x, int r, double alpha ) {
  int nhalf = n/2;
  int xNew = x/4;
  int xNewer = x/8;
  int rrank = getRelativeRank(x,xNew);
  // NOTE(review): rrank2 is never read in this function
  int rrank2 = getRelativeRank(xNew,xNewer);

  // local block lengths at the next depth
  int nOldTri = getSizeTri(r-1,x);
  int nOldSq = getSizeSq(r-1,x);
  double *C11 = C;
  double *C21 = C + nOldTri;
  double *C22 = C21 + nOldSq;
  double *A11 = A;
  double *A21 = A+nOldSq;
  double *A12 = A21+nOldSq;
  double *A22 = A12+nOldSq;

  // first do the 4-way re-arrangement.
  // NOTE(review): nCSize is computed but never read; the allocation of nC
  // below uses Csizes2[rrank] instead.
  int nCSize;
  if( rrank == 0 || rrank == 3 )
    nCSize = 4*nOldTri;
  else
    nCSize = 4*nOldSq;
  // scratch block for the second contribution to C21
  double *C21c = (double*) malloc( nOldSq*sizeof(double) );
  //int Csizes[] = {nOldTri,nOldSq,0,nOldTri};
  int Csizes2[] = {nOldTri,nOldSq,nOldSq,nOldTri};
  // stage-1 result buffer, sized for this rank's own slot
  double *nC = (double*) malloc( 4*Csizes2[rrank]*sizeof(double) );
  //startTimer(TIMER_COMM_SYRK);
  //reduceBy( 4, x, C1, nC, Csizes );
  //stopTimer(TIMER_COMM_SYRK);

  // Slot i of each table describes one of the four sub-problems: presumably
  // slot i is the one delivered to relative rank i -- TODO confirm against
  // reduceBy.
  double *A1[] = {A11,A21,A22,A22};
  double *A2[] = {A12,A11,A12,A21};
  int Asizes[] = {nOldSq,nOldSq,nOldSq,nOldSq};
  double *nA1 = (double*) malloc( 4*nOldSq*sizeof(double) );
  double *nA2 = (double*) malloc( 4*nOldSq*sizeof(double) );
  startTimer(TIMER_COMM_SYRK);
  reduceBy( 4, x, A1, nA1, Asizes );
  reduceBy( 4, x, A2, nA2, Asizes );
  stopTimer(TIMER_COMM_SYRK);
  if( rrank == 1 || rrank == 2 ) { // these two do the calls to mult
    mult( nC, nA1, nA2, nhalf, xNew, r-1, 0. );
  } else { // these two will do the recursive syrk calls.  First, we need to split them up further
    // receives the second half of the expanded stage-2 result
    double *nCcopy = (double*) malloc( 4*nOldTri*sizeof(double) );
    
    double *nnC = (double*) malloc( 8*nOldTri*sizeof(double) );
    double *nnA = (double*) malloc( 8*nOldSq*sizeof(double) );

    // stage 2: the two A blocks become the two slots of a 2-way reduceBy
    double *nnA1[] = {nA1,nA2};
    int nAsizes[] = {4*nOldSq,4*nOldSq};
    startTimer(TIMER_COMM_SYRK);
    reduceBy( 2, xNew, nnA1, nnA, nAsizes );
    stopTimer(TIMER_COMM_SYRK);
    double *nnC1[] = {nC,nCcopy};
    int nCsizes2[] = {4*nOldTri,4*nOldTri};
    // the timer pair below brackets a call that is disabled
    startTimer(TIMER_COMM_SYRK);
    //reduceBy( 2, xNew, nnC1, nnC, nCsizes );
    stopTimer(TIMER_COMM_SYRK);
    syrk( nnC, nnA, nhalf, xNewer, r-1, 0. );
 
    startTimer(TIMER_COMM_SYRK);
    expandBy( 2, xNew, nnC1, nnC, nCsizes2 );
    stopTimer(TIMER_COMM_SYRK);
    // final additions
    // nC += nCcopy: sum of the two expanded halves
    int ione = 1;
    double done = 1.;
    int s = 4*nOldTri;
    daxpy_( &s, &done, nCcopy, &ione, nC, &ione );
    free(nCcopy);
    free(nnC);
    free(nnA);
  }
  free(nA1);
  free(nA2);
  // recollect the answers, final additions
  // Destination of the expanded results: C itself when it may be
  // overwritten, temporaries otherwise.
  double *expC11, *expC21, *expC22;
  if( alpha == 0. ) {
    expC11 = C11;
    expC21 = C21;
    expC22 = C22;
  } else {
    expC11 = (double*) malloc( nOldTri*sizeof(double) );
    expC21 = (double*) malloc( nOldSq*sizeof(double) );
    expC22 = (double*) malloc( nOldTri*sizeof(double) );
  }
  double *C1[] = {expC11, expC21, C21c, expC22};
  startTimer(TIMER_COMM_SYRK);
  expandBy( 4, x, C1, nC, Csizes2 );
  stopTimer(TIMER_COMM_SYRK);
  free(nC);
  int ione = 1;
  double done = 1.;
  if( alpha != 0 ) { // only correct for alpha=1
    // C += temporaries, with coefficient 1 regardless of alpha
    daxpy_( &nOldTri, &done, expC11, &ione, C11, &ione );
    daxpy_( &nOldSq, &done, expC21, &ione, C21, &ione );
    daxpy_( &nOldTri, &done, expC22, &ione, C22, &ione );
    free(expC11);
    free(expC21);
    free(expC22);
  }
  // add the second C21 contribution in every case
  daxpy_( &nOldSq, &done, C21c, &ione, C21, &ione );
  free(C21c);
}
コード例 #19
0
// Convenience overload: creates the output with the allocator of `a`, fills
// it through the pointer-taking overload and returns it by value.
T Linalg<T, H>::syrk(
    const T &a, const value_type &alpha, const value_type &beta, Uplo uplo) {
  T result(a.allocator());
  syrk(a, &result, alpha, beta, uplo);
  return result;
}
コード例 #20
0
ファイル: syrk.cpp プロジェクト: benjamingr/CAPS
// Breadth-first step of the distributed syrk, two-way variant.
//
// Every process recurses with x/2 as the process count and performs two syrk
// calls and one mult call on half-size problems. Communication is issued in
// split form (an ...1 call followed later by the matching ...2 call) so that
// each transfer is interleaved with one of the computations; the statement
// order below is therefore significant.
//
// C, A:  local parts of the output and input; C is addressed as
//        [C11 | C21 | C22] and A as [A11 | A21 | A12 | A22].
// n:     current order (the recursion uses n/2).
// x:     current process count.
// r:     current recursion depth.
// alpha: 0 writes the result straight into C; any other value adds the
//        result to C with coefficient 1 (see the notes near the daxpy calls).
void syrkBFS2( double *C, double *A, int n, int x, int r, double alpha ) {
  int nhalf = n/2;
  int xNew = x/2;
  int rrank = getRelativeRank(x,xNew);

  // local block lengths at the next depth
  int nOldTri = getSizeTri(r-1,x);
  int nOldSq = getSizeSq(r-1,x);
  double *C11 = C;
  double *C21 = C + nOldTri;
  double *C22 = C21 + nOldSq;
  double *A11 = A;
  double *A21 = A+nOldSq;
  double *A12 = A21+nOldSq;
  double *A22 = A12+nOldSq;

  // scratch block for the second contribution to C21
  double *C21c = (double*) malloc( nOldSq*sizeof(double) );
  // nC1 collects the syrk results, nC2 the mult result
  double *nC1 = (double*) malloc( 2*nOldTri*sizeof(double) );
  double *nC2 = (double*) malloc( 2*nOldSq*sizeof(double) );
  int C1sizes[] = {nOldTri,nOldTri};
  // NOTE(review): C2sizes is never read in this function
  int C2sizes[] = {nOldSq,0};
  // also used as the size table for the A transfers (all square blocks)
  int C2sizes2[] = {nOldSq,nOldSq};

  // Slot i of each table belongs to one of the two halves: presumably slot i
  // is the one delivered to relative rank i -- TODO confirm against
  // reduceBy.
  double *A1[] = {A11,A22};
  double *A2[] = {A12,A21};
  double *A3[] = {A21,A12};
  double *nA1 = (double*) malloc( 2*nOldSq*sizeof(double) );
  double *nA2 = (double*) malloc( 2*nOldSq*sizeof(double) );
  double *nA3 = (double*) malloc( 2*nOldSq*sizeof(double) );
  startTimer(TIMER_COMM_SYRK);
  // NOTE(review): req and buf are passed without being initialised here;
  // presumably the ...1 calls take them by reference and set them -- confirm.
  MPI_Request *req;
  double *buf;
  reduceBy( 2, x, A1, nA1, C2sizes2 );
  iReduceBy1( 2, x, A2, nA2, C2sizes2, req, buf );
  stopTimer(TIMER_COMM_SYRK);

  // first syrk runs while the A2 transfer is pending
  syrk( nC1, nA1, nhalf, xNew, r-1, 0. );

  startTimer(TIMER_COMM_SYRK);
  iReduceBy2( 2, x, A2, nA2, C2sizes2, req, buf );
  iReduceBy1( 2, x, A3, nA3, C2sizes2, req, buf );
  stopTimer(TIMER_COMM_SYRK);

  // second syrk accumulates into nC1 while the A3 transfer is pending
  syrk( nC1, nA2, nhalf, xNew, r-1, 1. );

  // Destination of the expanded results: C itself when it may be
  // overwritten, temporaries otherwise.
  double *expC11, *expC21, *expC22;
  if( alpha == 0 ) {
    expC11 = C11;
    expC21 = C21;
    expC22 = C22;
  } else {
    expC11 = (double*) malloc( nOldTri*sizeof(double) );
    expC21 = (double*) malloc( nOldSq*sizeof(double) );
    expC22 = (double*) malloc( nOldTri*sizeof(double) );
  }
  double *C1[] = {expC11,expC22};
  double *C2[] = {expC21,C21c};

  startTimer(TIMER_COMM_SYRK);
  iReduceBy2( 2, x, A3, nA3, C2sizes2, req, buf );
  iExpandBy1( 2, x, C1, nC1, C1sizes, req, buf );
  stopTimer(TIMER_COMM_SYRK);

  // the operand order of the off-diagonal product depends on the half this
  // process belongs to; it runs while the nC1 expansion is pending
  if( rrank == 0 )
    mult( nC2, nA3, nA1, nhalf, xNew, r-1, 0. );
  else
    mult( nC2, nA1, nA3, nhalf, xNew, r-1, 0. );


  startTimer(TIMER_COMM_SYRK);
  iExpandBy2( 2, x, C1, nC1, C1sizes, req, buf );
  iExpandBy1( 2, x, C2, nC2, C2sizes2, req, buf );
  stopTimer(TIMER_COMM_SYRK);

  int ione = 1;
  double done = 1.;
  if( alpha != 0 ) { // actually, this only works for alpha = 1
    // diagonal blocks: C += temporaries, with coefficient 1
    daxpy_( &nOldTri, &done, expC11, &ione, C11, &ione );
    daxpy_( &nOldTri, &done, expC22, &ione, C22, &ione );
    free(expC11);
    free(expC22);
  }

  startTimer(TIMER_COMM_SYRK);
  iExpandBy2( 2, x, C2, nC2, C2sizes2, req, buf );
  stopTimer(TIMER_COMM_SYRK);

  if( alpha != 0 ) { // actually, this only works for alpha = 1
    // off-diagonal block: C21 += temporary, with coefficient 1
    daxpy_( &nOldSq, &done, expC21, &ione, C21, &ione );
    free(expC21);
  }

  // add the second C21 contribution in every case
  daxpy_( &nOldSq, &done, C21c, &ione, C21, &ione );

  free(C21c);
  free(nC1);
  free(nC2);
  free(nA1);
  free(nA2);
  free(nA3);
}