Example #1
static void chol_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();    
    auto a    = make_tridiagonal(grid, n_global);    

    // Compute Cholesky factorization of A in-place
    char       uplo = 'U';
    blas_idx_t ia       = 1, ja = 1, info;

    MPI_Barrier (MPI_COMM_WORLD);
    double t0 = MPI_Wtime();
    pdpotrf_ (uplo, n_global, a->local_data(), ia, ja, a->descriptor(), info);
    assert(info == 0);

    double t1 = MPI_Wtime() - t0;
  
    double t_glob;
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0) 
    {
        double gflops = potrf_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
            "MATRIX CHOLESKY FACTORIZATION BENCHMARK SUMMARY\n"
            "===============================================\n"
            "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxPOTRF = %10.7f seconds\tGflops/Proc = %10.7f\n",
            n_global, grid->nprocs(), grid->nprows(), grid->npcols(), 
            t_glob, gflops);
        fflush(stdout);
    }
}
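potrf_flops feeds the Gflops figure but is not shown in the listing; a minimal sketch, assuming the textbook count of about n^3/3 floating-point operations for the Cholesky factorization of an n x n matrix, returned in Gflop so the printed Gflops/Proc comes out in the right units:

// Hypothetical helper, not part of the snippet above: approximate flop
// count for Cholesky factorization of an n x n matrix, in Gflop.
static double potrf_flops(blas_idx_t n)
{
    double nd = static_cast<double>(n);
    return (nd * nd * nd / 3.0 + nd * nd / 2.0 + nd / 6.0) / 1e9;
}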
Example #2
void Buddha::worker(uint64_t from, uint64_t to) {
    uint64_t filled = 0;
    std::vector<uint64_t> local_data(thread_vector_size_);
    std::size_t progress_local = 0;

    floating_type radius_sqr = radius_ * radius_;
    floating_type subpixel_width  = 2 * radius_ / x_size_;
    floating_type subpixel_height = 2 * radius_ / y_size_;

    // Sweep a grid of sub-pixel offsets; within each offset, process the
    // pixel range [from, to) assigned to this worker.
    for (uint64_t sub_x = 0; sub_x < subpixel_resolution_; ++sub_x) {
        for (uint64_t sub_y = 0; sub_y < subpixel_resolution_; ++sub_y) {
            for (uint64_t i = from; i < to; ++i) {

                ++progress_local;

                // Flush when the buffer may lack room for one more full orbit.
                if (filled + max_iterations_ >= thread_vector_size_)
                    flush_data(local_data, filled);

                complex_type c = lin2complex(i);
                c.real(c.real() + sub_x * subpixel_width);
                c.imag(c.imag() + sub_y * subpixel_height);
                complex_type z = c;

                uint64_t pos = 0;
                // Points inside the Mandelbrot set never escape, so their
                // orbits would never be recorded; skip them early.
                if (mandelbrot_hint(c))
                    continue;

                while (z.real() * z.real() + z.imag() * z.imag() < radius_sqr
                    && pos < max_iterations_) {
                    // TODO: Possible optimization when computing abs(z)^2.

                    uint64_t zpos = complex2lin(z);

                    if (zpos < data_.size()) {
                        local_data[filled + pos] = zpos;
                    }

                    z *= z;
                    z += c;
                    ++pos;
                }

                // Keep the orbit only if it escaped within the window
                // [min_iterations_, max_iterations_); otherwise the buffered
                // points are simply overwritten by the next orbit.
                if (pos >= min_iterations_ && pos < max_iterations_) {
                    filled += pos;
                }
            }

            // Publish progress in batches to limit contention on the shared counter.
            if (progress_local > 10000) {
                progress_ += progress_local;
                progress_local = 0;
            }
        }
    }

    // Publish any remaining progress and flush the tail of the buffer.
    progress_ += progress_local;
    flush_data(local_data, filled);
}
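mandelbrot_hint is not shown in the listing; a plausible sketch, assuming it implements the standard main-cardioid and period-2-bulb membership test (any c inside those regions never escapes, so its orbit can be skipped outright):

// Hypothetical sketch: returns true when c provably lies inside the
// Mandelbrot set (main cardioid or period-2 bulb), so its orbit never
// escapes and the worker can skip it.
bool Buddha::mandelbrot_hint(complex_type c) {
    floating_type x = c.real(), y = c.imag();
    floating_type xq = x - 0.25;
    floating_type q = xq * xq + y * y;
    if (q * (q + xq) < 0.25 * y * y)             // main cardioid
        return true;
    if ((x + 1.0) * (x + 1.0) + y * y < 0.0625)  // period-2 bulb
        return true;
    return false;
}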
Example #3
void umat(TUmatData &umat_data)
{
    typedef LocalData<TUmatData> TLocalData;

    /// localdata
    TLocalData local_data(umat_data);
//     PRINT(local_data.epsilon);

    /// stiffness (1D case: the 1x1 Hooke matrix reduces to Young's modulus)
    local_data.H0(0, 0) = local_data.E0;
//     PRINT(local_data.H0);

    /// stress
    local_data.sigma = local_data.H0 * local_data.epsilon;
//     PRINT(local_data.sigma);

    /// energy: elastic energy density = sigma . epsilon / 2
    local_data.ener = dot_vec_col(local_data.sigma, local_data.epsilon)/2;
//     PRINT(local_data.ener);

} // void umat
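dot_vec_col is not defined in the listing; a generic sketch, assuming it is a plain dot product between the stress and strain vectors (the real container types are defined elsewhere):

#include <cstddef>

// Hypothetical sketch: dot product of two Voigt-notation vectors, assuming
// both expose size() and operator[]; the actual types are not shown above.
template <class V1, class V2>
double dot_vec_col(const V1& a, const V2& b)
{
    double s = 0.0;
    for (std::size_t i = 0; i < a.size(); ++i)
        s += a[i] * b[i];
    return s;
}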
Example #4
void umat (TUmatData &umat_data)
{
    typedef LocalData<TUmatData> TLocalData;

    /// localdata
    TLocalData local_data(umat_data);
//     PRINT(local_data.epsilon);

    /// stiffness (isotropic Hooke matrix from Young's modulus E0 and Poisson's ratio N0)
    local_data.H0 = elasticity_isotrope_H<ndim> (
        local_data.E0,
        local_data.N0);
//     PRINT(local_data.H0);

    /// stress
    local_data.sigma = local_data.H0 * local_data.epsilon;
//     PRINT(local_data.sigma);

    /// energy: elastic energy density = sigma . epsilon / 2
    local_data.ener = dot_vec_col(local_data.sigma, local_data.epsilon)/2;
//     PRINT(local_data.ener);

} // void umat
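elasticity_isotrope_H is likewise defined elsewhere; a minimal sketch, assuming it builds the standard isotropic Hooke matrix in Voigt notation from E0 (Young's modulus) and N0 (Poisson's ratio), with a plain vector-of-vectors standing in for the real matrix type:

#include <vector>

using Matrix = std::vector<std::vector<double>>;  // stand-in for the real type of H0

// Hypothetical sketch of an isotropic stiffness matrix in Voigt notation
// (ndim = 3 gives the usual 6x6 matrix; ndim = 2 gives plane strain).
template <int ndim>
Matrix elasticity_isotrope_H(double E, double nu)
{
    const int voigt = ndim * (ndim + 1) / 2;
    const double lambda = E * nu / ((1 + nu) * (1 - 2 * nu));  // Lame's first parameter
    const double mu     = E / (2 * (1 + nu));                  // shear modulus
    Matrix H(voigt, std::vector<double>(voigt, 0.0));
    for (int i = 0; i < ndim; ++i)
        for (int j = 0; j < ndim; ++j)
            H[i][j] = (i == j) ? lambda + 2.0 * mu : lambda;   // volumetric block
    for (int i = ndim; i < voigt; ++i)
        H[i][i] = mu;                                          // engineering shear terms
    return H;
}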
Example #5
///  The computed inverse ends up in the matrix ai
///  n_global: the order of the matrix
static void inv_driver(blas_idx_t n_global)		
{

    auto grid = std::make_shared<blacs_grid_t>();
	
	//// scratch test code (kept commented out)
	//n_global = 3;
	//double *aaa = new double[n_global*n_global];
	//for (int i = 0; i < 9; i++)
	//{
	//	aaa[i] = i + 1;
	//}
	//aaa[8] = 10;
	//auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);


    // Create an NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);        

    // Create an NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier (MPI_COMM_WORLD);

    double t0 = MPI_Wtime();
    
    // Factorize A 
    blas_idx_t ia = 1, ja = 1;
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;

	// The routine name decodes as D-GE-TRF:
	//   D   - the matrix elements are doubles
	//   GE  - the matrix is general (no special structure assumed)
	//   TRF - triangular factorization, i.e. the usual LU decomposition
    pdgetrf_(n_global, n_global, 
        ai->local_data(), ia, ja, ai->descriptor(), 
        ipiv.data(), 
        info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Compute workspace for double and integer work arrays on each process
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work (lwork); 
    std::vector<blas_idx_t> iwork(liwork);

    lwork = liwork = -1;   

	// Workspace query: with lwork = liwork = -1, pdgetri_ only reports the required sizes
    pdgetri_(n_global, 
        ai->local_data(), ia, ja, ai->descriptor(), 
        ipiv.data(), 
        work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    lwork  = static_cast<blas_idx_t>(work[0]);
    liwork = static_cast<blas_idx_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, 
        ai->local_data(), ia, ja, ai->descriptor(), 
        ipiv.data(), 
        work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute R = A * A^{-1} - I (stored in `identity`) and verify that ||R|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha, 
        a->local_data() , ia, ja, a->descriptor(),
        ai->local_data(), ia, ja, ai->descriptor(),
        beta,
        identity->local_data(), ia, ja, identity->descriptor());

    // Compute 1-norm of the result
    char norm='1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global, 
        identity->local_data(), ia, ja, identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0) 
    {
        double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
            "MATRIX INVERSE BENCHMARK SUMMARY\n"
            "================================\n"
            "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
            n_global, grid->nprocs(), grid->nprows(), grid->npcols(), 
            t_glob, gflops, err);
        fflush(stdout);
    }
}
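getri_flops is not shown; a minimal sketch, assuming the textbook counts of about (2/3)n^3 flops for the LU factorization (PxGETRF) plus (4/3)n^3 for the inversion from the LU factors (PxGETRI), i.e. roughly 2n^3 in total, returned in Gflop:

// Hypothetical helper, not part of the snippet above: approximate flop
// count for matrix inversion via LU, in Gflop.
// PxGETRF ~ (2/3)n^3 flops, PxGETRI ~ (4/3)n^3 flops, total ~ 2n^3.
static double getri_flops(blas_idx_t n)
{
    double nd = static_cast<double>(n);
    return 2.0 * nd * nd * nd / 1e9;
}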
Example #6
static void dgemm_driver(blas_idx_t m_global, blas_idx_t n_global, blas_idx_t k_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    auto a = block_cyclic_mat_t::random(grid, m_global, k_global);
  //  auto b = block_cyclic_mat_t::random(grid, k_global, n_global);
    auto c = block_cyclic_mat_t::random(grid, m_global, n_global);
	

	// For testing: fill B with a deterministic pattern instead of random values.
	// B must be k_global x n_global to be conformable with A (m x k) in the GEMM below.
	double *dd = new double[k_global*n_global];
	for (int i = 0; i < k_global*n_global; i++)
	{
		dd[i] = i;
	}
	auto b = block_cyclic_mat_t::createWithArray(grid, k_global, n_global, dd);
	// The matrix computation starts here

    MPI_Barrier(MPI_COMM_WORLD);

    double alpha = 1.0, beta = 0.0;

    double t0 = MPI_Wtime();
    char NEIN = 'N';	// 'N' means no transpose
    blas_idx_t ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1;

	// sub(C) = alpha*op(sub(A))*op(sub(B)) + beta*sub(C)
    pdgemm_ (NEIN, NEIN, m_global, n_global, k_global, 
        alpha, 
        a->local_data(), ia, ja, a->descriptor(), 
        b->local_data(), ib, jb, b->descriptor(),
        beta,
        c->local_data(), ic, jc, c->descriptor()  
		);
    
	double t1 = MPI_Wtime() - t0;
	double t_glob;
	// Take the maximum elapsed time across all processes
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); 


    if (grid->iam() == 0) // only the rank-0 process reports
    { 
        double gflops = gemm_flops(m_global, n_global, k_global)/t_glob/grid->nprocs();

        printf("\n"
            "MATRIX MULTIPLY BENCHMARK SUMMARY\n"
            "=================================\n"
            "M = %d\tN = %d\tK = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxGEMM = %10.7f seconds\tGFlops/Proc = %10.7f\n", 
            m_global, n_global, k_global, grid->nprocs(), grid->nprows(), grid->npcols(),
            t_glob, gflops);
        fflush(stdout);

		// Print the top-left 10x10 corner of the local block of C.
		// ScaLAPACK local blocks are stored column-major with leading
		// dimension equal to the number of local rows.
		for (int i = 0; i < 10; i++)
		{
			for (int j = 0; j < 10; j++)
			{
				printf("%f ", c->local_data()[i + j * c->local_rows()]);
			}
			printf("\n");
		}
		fflush(stdout);
    }
}
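gemm_flops is likewise not shown; a minimal sketch assuming the usual count of 2*m*n*k flops for C = A*B + C, returned in Gflop to match the printed GFlops/Proc:

// Hypothetical helper, not part of the snippet above: approximate flop
// count for a general matrix multiply, in Gflop (one multiply and one
// add per term of each of the m*n inner products of length k).
static double gemm_flops(blas_idx_t m, blas_idx_t n, blas_idx_t k)
{
    return 2.0 * static_cast<double>(m) * static_cast<double>(n)
               * static_cast<double>(k) / 1e9;
}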
Example #7
double HDP_MEDOIDS (vector<Instance*>& data, vector< vector<double> >& means, Lookups* tables, vector<double> lambdas, dist_func df, int FIX_DIM) {
    // STEP ZERO: validate input and initialization
    int N = tables->nWords;
    int D = tables->nDocs;
    vector< pair<int, int> > doc_lookup = *(tables->doc_lookup);
    double lambda_global = lambdas[0];
    double lambda_local = lambdas[1];

    vector< vector<double> > global_means (1, vector<double>(FIX_DIM, 0.0));
    vector< vector<int> > k (D, vector<int>(1,0));  // global association
    vector<int> z (N, 0); // local assignment
    vector<int> global_asgn (N, 0); // global assignment

    // STEP ONE: a. set initial *global* medoid as global mean
    compute_assignment (global_asgn, k, z, tables);
    compute_means (data, global_asgn, FIX_DIM, global_means);

    double last_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM);
    double new_cost = last_cost;
    while (true) {
        // 4. for each point x_ij,
        for (int j = 0; j < D; j ++) {
            for (int i = doc_lookup[j].first; i < doc_lookup[j].second; i++) {
                int num_global_means = global_means.size();
                vector<double> d_ij (num_global_means, 0.0);
                for (int p = 0; p < num_global_means; p ++) {
                    Instance* temp_ins = vec2ins(global_means[p]);
                    double euc_dist = df(data[i], temp_ins, FIX_DIM);
                    d_ij[p] = euc_dist * euc_dist;
                    delete temp_ins;
                }
                set<int> temp;
                for (int p = 0; p < num_global_means; p ++) temp.insert(p);
                int num_local_means = k[j].size();
                for (int q = 0; q < num_local_means; q ++) temp.erase(k[j][q]);
                set<int>::iterator it; 
                for (it=temp.begin(); it!=temp.end();++it) d_ij[*it] += lambda_local;
                int min_p = -1; double min_dij = INF;
                for (int p = 0; p < num_global_means; p ++) 
                    if (d_ij[p] < min_dij) {
                        min_p = p;
                        min_dij = d_ij[p];
                    }
                if (min_dij > lambda_global + lambda_local) {
                    z[i] = num_local_means; 
                    k[j].push_back(num_global_means);
                    vector<double> new_g(FIX_DIM, 0.0);
                    for (int f = 0; f < data[i]->fea.size(); f++)
                        new_g[data[i]->fea[f].first-1] = data[i]->fea[f].second;
                    global_means.push_back(new_g);
                    // cout << "global and local increment" << endl;
                } else {
                    bool c_exist = false;
                    for (int c = 0; c < num_local_means; c ++) 
                        if (k[j][c] == min_p) {
                            z[i] = c;
                            c_exist = true;
                            break;
                        }
                    if (!c_exist) {
                        z[i] = num_local_means;
                        k[j].push_back(min_p);
                       // cout << "local increment" << endl;
                    }
                }
            }
        }
        /*
        cout << "half..........." << endl;
        cout << "#global created: " << global_means.size() 
            << ", #global used: " << get_num_global_means(k);
            */
        new_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM);
        // 5. for all local clusters,
        for (int j = 0; j < D; j ++) {
            int begin_i = doc_lookup[j].first;
            int end_i = doc_lookup[j].second;
            int doc_len = doc_lookup[j].second - doc_lookup[j].first;
            int num_local_means = k[j].size();

            // all local clusters are distinct to each other
            /*
            set<int> temp;
            for (int y = 0; y < num_local_means; y++)
                temp.insert(k[j][y]);
            cout << temp.size() << " ==? " << num_local_means << endl;
            assert (temp.size() == num_local_means);
            */

            // compute means of local clusters
            vector< vector<double> > local_means (num_local_means, vector<double>(FIX_DIM, 0.0));
            vector<int> local_asgn (z.begin()+begin_i, z.begin()+end_i);
            vector<Instance*> local_data (data.begin()+begin_i,data.begin()+end_i);
            compute_means (local_data, local_asgn, FIX_DIM, local_means);
            assert (num_local_means == local_means.size());

            // pre-compute instances for global means 
            int num_global_means = global_means.size();
            vector<Instance*> temp_global_means (num_global_means, NULL);
            for (int p = 0; p < num_global_means; p ++) 
                temp_global_means[p] = vec2ins (global_means[p]);

            // pre-compute instances for local means 
            vector<Instance*> temp_local_means (num_local_means, NULL);
            for (int c = 0; c < num_local_means; c ++) 
                temp_local_means[c] = vec2ins (local_means[c]);

            for (int c = 0; c < num_local_means; c++) {
                // compute distance of local clusters to each global cluster
                num_global_means = global_means.size();
                vector<double> d_jcp (num_global_means, 0.0);
                double sum_d_ijc = 0.0; 
                for (int i = doc_lookup[j].first; i < doc_lookup[j].second; i ++) {
                    if (z[i] != c) continue;
                    double local_dist = df (data[i], temp_local_means[c], FIX_DIM);
                    sum_d_ijc += local_dist * local_dist;
                    for (int p = 0; p < num_global_means; p ++) {
                        double dist = df (data[i], temp_global_means[p], FIX_DIM);
                        d_jcp[p] += dist * dist;
                    }
                }
                int min_p = -1; double min_d_jcp = INF;
                for (int p = 0; p < num_global_means; p ++) 
                    if (d_jcp[p] < min_d_jcp) {
                        min_p = p;
                        min_d_jcp = d_jcp[p];
                    }
                assert (min_p >= 0);
                // cout << min_d_jcp << " " << lambda_global << " " << sum_d_ijc << endl;
                if (min_d_jcp > lambda_global + sum_d_ijc) {
                    global_means.push_back(local_means[c]); //  push mu_jc
                    temp_global_means.push_back(vec2ins (local_means[c]));
                    k[j][c] = num_global_means;
                    // cout << "global increment" << endl;
                } else {
                    k[j][c] = min_p;
                }
            }
            for (int c = 0; c < num_local_means; c ++) 
                delete temp_local_means[c];
            num_global_means = global_means.size();
            for (int p = 0; p < num_global_means; p ++) 
                delete temp_global_means[p];
        }
        // 6. for each global clusters,
        compute_assignment (global_asgn, k, z, tables);
        /*
        cout << "compute global means.." << endl;
        cout << "#global created: " << global_means.size() 
            << ", #global used: " << get_num_global_means(k);
            */
        compute_means (data, global_asgn, FIX_DIM, global_means);

        // 7. convergence?
        new_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM);
        if ( new_cost < objmin ) objmin = new_cost;  // track the best objective seen so far
        objmin_trace << omp_get_wtime()-start_time << " " << objmin << endl;
        if (new_cost == last_cost)  // objective unchanged: converged
            break;
        if (new_cost < last_cost) {
            last_cost = new_cost;
        } else {
            // The objective should decrease monotonically; an increase signals a bug.
            cerr << "failure" << endl;
            return INF;
        }
    }
    means = global_means;
    return last_cost;
} // HDP_MEDOIDS
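The helpers used throughout (vec2ins, compute_means, compute_cost, compute_assignment) are defined elsewhere; as one example, a minimal sketch of vec2ins consistent with the 1-based sparse indexing used above (data[i]->fea[f].first - 1) and with the caller's responsibility to delete the result:

// Hypothetical sketch: wrap a dense mean vector in a heap-allocated Instance
// whose fea member stores 1-based (index, value) pairs; callers in
// HDP_MEDOIDS delete the returned object themselves.
Instance* vec2ins (vector<double>& v) {
    Instance* ins = new Instance ();
    for (int f = 0; f < (int) v.size(); f ++)
        if (v[f] != 0.0)
            ins->fea.push_back(make_pair(f + 1, v[f]));
    return ins;
}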