Example #1
0
/**
 * Times an n-by-n cblas_sgemm call (C = 1.0*A*B + 1.0*C, column-major,
 * no transposes) and reports the fastest of `samples` runs.
 *
 * Before every run the three operands are re-filled through re_random_vec().
 *
 * @param n        Order of the square matrices.
 * @param samples  Number of timed repetitions.
 * @param duration [out] Smallest value of ticks2sec(end, start) over all runs;
 *                 remains at the 9999999999.0 sentinel when samples <= 0.
 * @param GFLOPS   [out] Result of gemm_flops(n, n, n, 0).
 *                 NOTE(review): this is not divided by `duration` here —
 *                 presumably the caller does that; confirm.
 *
 * Side effect: writes an integer checksum of the operands to cout so the
 * compiler cannot discard the computation.
 *
 * NOTE(review): `n * n` is evaluated in `int`, and the buffers are released
 * with plain delete[], so they leak if anything in between throws.
 */
void cpu_benchmark(int n, int samples, double &duration, double &GFLOPS)
{
    type_precision* A = new type_precision[n * n];
    type_precision* B = new type_precision[n * n];
    type_precision* C = new type_precision[n * n];

    cputime_type start_tick, end_tick;
    duration = 9999999999.0;

    // Checksum accumulators are 64-bit: with `int` the running sum of the
    // post-GEMM values overflows for moderately large n, which is undefined
    // behaviour. For inputs where the old `int` version was well-defined the
    // printed value is unchanged.
    long long checksum = 0;

    for (int i = 0; i < samples; i++)
    {
        re_random_vec(A, n*n);
        re_random_vec(B, n*n);
        re_random_vec(C, n*n);

        get_ticks(start_tick);
        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
                 1.0, A, n, B, n, 1.0, C, n);
        get_ticks(end_tick);

        // Keep the best (smallest) time seen so far.
        duration = min(duration, static_cast<double>(ticks2sec(end_tick, start_tick)));

        // Touch every element of all three matrices so the GEMM result is
        // observably used. The sum is truncated to an integer on every step,
        // exactly as before.
        long long sample_sum = 0;
        for (int j = 0; j < n * n ; j++)
        {
            sample_sum += A[j] + B[j] + C[j];
        }
        checksum += sample_sum;
    }

    // Original author's flop model: 2n^3 - n^2 + 2n^2 (extra terms from "+ C").
    GFLOPS = gemm_flops(n, n, n, 0);

    cout << checksum;

    delete []A;
    delete []B;
    delete []C;
}
Example #2
0
/**
 * Runs one distributed matrix multiply with pdgemm_ and prints a timing
 * summary on process 0.
 *
 * Computes sub(C) = alpha*sub(A)*sub(B) + beta*sub(C) with alpha = 1.0,
 * beta = 0.0 and no transposition, where A is m_global x k_global,
 * B is k_global x n_global and C is m_global x n_global.
 *
 * @param m_global  Global row count of A and C.
 * @param n_global  Global column count of B and C.
 * @param k_global  Global column count of A / row count of B.
 */
static void dgemm_driver(blas_idx_t m_global, blas_idx_t n_global, blas_idx_t k_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    auto a = block_cyclic_mat_t::random(grid, m_global, k_global);
    auto c = block_cyclic_mat_t::random(grid, m_global, n_global);

    // TODO: test scaffolding — B is built from a host array holding
    // 0, 1, 2, ... instead of block_cyclic_mat_t::random(grid, k_global, n_global).
    // B must be k_global x n_global to be conformant with A (m x k) and
    // C (m x n); the earlier m_global x k_global shape only worked when all
    // three dimensions were equal.
    // NOTE(review): dd is never freed in this function; whether
    // createWithArray() copies or adopts the buffer is not visible here —
    // confirm before adding a delete[].
    const blas_idx_t b_elems = k_global * n_global;
    double *dd = new double[b_elems];
    for (blas_idx_t i = 0; i < b_elems; i++)
    {
        dd[i] = static_cast<double>(i);
    }
    auto b = block_cyclic_mat_t::createWithArray(grid, k_global, n_global, dd);

    // The matrix computation starts here; line all processes up first.
    MPI_Barrier(MPI_COMM_WORLD);

    double alpha = 1.0, beta = 0.0;

    double t0 = MPI_Wtime();
    char NEIN = 'N';    // 'N' = do not transpose
    blas_idx_t ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1;

    // sub(C) = alpha*op(sub(A))*op(sub(B)) + beta*sub(C)
    pdgemm_ (NEIN, NEIN, m_global, n_global, k_global,
        alpha,
        a->local_data(), ia, ja, a->descriptor(),
        b->local_data(), ib, jb, b->descriptor(),
        beta,
        c->local_data(), ic, jc, c->descriptor()
        );

    double t1 = MPI_Wtime() - t0;
    double t_glob;
    // Take the longest elapsed time over all processes (result lands on rank 0).
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);


    if (grid->iam() == 0) // only the process whose grid id is 0 reports
    {
        double gflops = gemm_flops(m_global, n_global, k_global)/t_glob/grid->nprocs();

        printf("\n"
            "MATRIX MULTIPLY BENCHMARK SUMMARY\n"
            "=================================\n"
            "M = %d\tN = %d\tK = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxGEMM = %10.7f seconds\tGFlops/Proc = %10.7f\n",
            m_global, n_global, k_global, grid->nprocs(), grid->nprows(), grid->npcols(),
            t_glob, gflops); fflush(stdout);

        // Debug dump of a 10x10 window of this process's local part of C.
        // NOTE(review): the index uses k_global as the stride and assumes the
        // local buffer holds at least 9*k_global + 10 elements; that is not
        // guaranteed for small or distributed matrices — confirm against the
        // local layout of block_cyclic_mat_t.
        for (int i = 0; i < 10; i++)
        {
            for (int j = 0; j < 10; j++)
            {
                printf("%f ", c->local_data()[i*k_global + j]);
            }
            printf("\n");
        }
        fflush(stdout);
    }
}