mat mars(Agraph_t* g, struct marsopts opts)
{
    int i, j, n = agnnodes(g), k = MIN(n, MAX(opts.k, 2)), iter = 0;
    mat dij, u, u_trans, q, r, q_t, tmp, tmp2, z;
    double* s = (double*) malloc(sizeof(double)*k);
    double* ones = (double*) malloc(sizeof(double)*n);
    double* d;
    int* anchors = (int*) malloc(sizeof(int)*k);
    int* clusters = NULL;
    double change = 1, old_stress = -1;

    dij = mat_new(k, n);
    u = mat_new(n, k);
    tmp = mat_new(n, k);
    darrset(ones, n, -1);

    /* pick k anchor nodes; dij (k x n) gets their distances to all n nodes */
    select_anchors(g, dij, anchors, k);
    if(opts.color) {
        /* mark the anchors for visualization */
        for(i = 0; i < k; i++) {
            Agnode_t* anchor = get_node(anchors[i]);
            agset(anchor, "color", "red");
        }
    }
    if(opts.power != 1) {
        clusters = graph_cluster(g, dij, anchors);
    }

    /* low-rank factors: u is n x k, s holds the (negated) singular values */
    singular_vectors(g, dij, opts.power, u, s);
    vec_scalar_mult(s, k, -1);
    u_trans = mat_trans(u);
    d = mat_mult_for_d(u, s, u_trans, ones);

    /* apply inv_mul_ax() to each column of u, collecting the results in tmp */
    for(i = 0; i < u->c; i++) {
        double* col = mat_col(u, i);
        double* b = inv_mul_ax(d, col, u->r);
        for(j = 0; j < u->r; j++) {
            tmp->m[mindex(j, i, tmp)] = b[j];
        }
        free(b);
        free(col);
    }
    tmp2 = mat_mult(u_trans, tmp);
    for(i = 0; i < k; i++) {
        tmp2->m[mindex(i, i, tmp2)] += (1.0/s[i]);
    }

    /* QR-factorize tmp2; q_t and r are reused in the solves below */
    q = mat_new(tmp2->r, tmp2->c);
    r = mat_new(tmp2->c, tmp2->c);
    qr_factorize(tmp2, q, r);
    q_t = mat_trans(q);

    /* initial positions: user-supplied or random */
    if(opts.given) {
        z = get_positions(g, opts.dim);
    } else {
        z = mat_rand(n, opts.dim);
    }
    translate_by_centroid(z);

    if(opts.viewer) {
        init_viewer(g, opts.max_iter);
        append_layout(z);
    }

    /* iterate until the relative change in stress drops below EPSILON */
    old_stress = stress(z, dij, anchors, opts.power);
    while(change > EPSILON && iter < opts.max_iter) {
        mat right_side;
        double new_stress;

        if(opts.power == 1) {
            right_side = barnes_hut(z);
        } else {
            right_side = barnes_hut_cluster(z, dij, clusters, opts.power);
        }

        /* solve for each dimension; sum/right_side->r shifts by the column mean of b */
        for(i = 0; i < opts.dim; i++) {
            double sum = 0;
            double* x;
            double* b = mat_col(right_side, i);
            for(j = 0; j < right_side->r; j++) {
                sum += b[j];
            }
            x = inv_mul_full(d, b, right_side->r, u, u_trans, q_t, r);
            for(j = 0; j < z->r; j++) {
                z->m[mindex(j, i, z)] = x[j] - sum/right_side->r;
            }
            free(x);
            free(b);
        }

        adjust_anchors(g, anchors, k, z);
        update_anchors(z, dij, anchors, opts.power);
        translate_by_centroid(z);
        if(opts.viewer) {
            append_layout(z);
        }

        new_stress = stress(z, dij, anchors, opts.power);
        change = fabs(new_stress - old_stress)/old_stress;
        old_stress = new_stress;

        mat_free(right_side);
        iter++;
    }

    /* free everything except z, which the caller owns */
    mat_free(dij);
    mat_free(u);
    mat_free(u_trans);
    mat_free(q);
    mat_free(r);
    mat_free(q_t);
    mat_free(tmp);
    mat_free(tmp2);
    free(s);
    free(ones);
    free(d);
    free(anchors);
    free(clusters);

    return z;
}
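/*
 * Hedged usage sketch: how a driver might call mars().  The marsopts field
 * names (k, power, dim, max_iter, given, color, viewer) are taken from the
 * function body above; the driver itself, the chosen values, and the idea of
 * writing z back into node positions are illustrative assumptions, not the
 * project's actual code.  Assumes the project's own headers declaring
 * Agraph_t, mat, struct marsopts, mars() and mat_free() are in scope.
 */
static void layout_with_mars_sketch(Agraph_t *g)
{
    struct marsopts opts = {0};
    opts.k = 100;          /* requested anchor count; mars() clamps it to [2, n] */
    opts.power = 1;        /* 1 selects the plain barnes_hut() right-hand side */
    opts.dim = 2;          /* layout dimension */
    opts.max_iter = 200;
    opts.given = 0;        /* start from a random layout instead of "pos" */

    mat z = mars(g, opts); /* n x dim coordinate matrix; the caller owns it */
    /* ... copy z->m back into node position attributes here ... */
    mat_free(z);
}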
matrix_t * mpi_mat_rand(
    idx_t const mode,
    idx_t const nfactors,
    permutation_t const * const perm,
    rank_info * const rinfo)
{
  idx_t const localdim = rinfo->mat_end[mode] - rinfo->mat_start[mode];
  matrix_t * mymat = mat_alloc(localdim, nfactors);

  MPI_Status status;

  /* figure out buffer sizes */
  idx_t maxlocaldim = localdim;
  if(rinfo->rank == 0) {
    MPI_Reduce(MPI_IN_PLACE, &maxlocaldim, 1, SPLATT_MPI_IDX, MPI_MAX, 0,
        rinfo->comm_3d);
  } else {
    MPI_Reduce(&maxlocaldim, NULL, 1, SPLATT_MPI_IDX, MPI_MAX, 0,
        rinfo->comm_3d);
  }

  /* root rank does the heavy lifting */
  if(rinfo->rank == 0) {
    /* allocate buffers */
    idx_t * loc_perm = splatt_malloc(maxlocaldim * sizeof(*loc_perm));
    val_t * vbuf = splatt_malloc(maxlocaldim * nfactors * sizeof(*vbuf));

    /* allocate initial factor */
    matrix_t * full_factor = mat_rand(rinfo->global_dims[mode], nfactors);

    /* copy root's own matrix to output */
    #pragma omp parallel for schedule(static)
    for(idx_t i=0; i < localdim; ++i) {
      idx_t const gi = rinfo->mat_start[mode] + perm->iperms[mode][i];
      for(idx_t f=0; f < nfactors; ++f) {
        mymat->vals[f + (i*nfactors)] = full_factor->vals[f+(gi*nfactors)];
      }
    }

    /* communicate! */
    for(int p=1; p < rinfo->npes; ++p) {
      /* first receive layer start and permutation info */
      idx_t layerstart;
      idx_t nrows;
      MPI_Recv(&layerstart, 1, SPLATT_MPI_IDX, p, 0, rinfo->comm_3d, &status);
      MPI_Recv(&nrows, 1, SPLATT_MPI_IDX, p, 1, rinfo->comm_3d, &status);
      MPI_Recv(loc_perm, nrows, SPLATT_MPI_IDX, p, 2, rinfo->comm_3d, &status);

      /* fill buffer */
      #pragma omp parallel for schedule(static)
      for(idx_t i=0; i < nrows; ++i) {
        idx_t const gi = layerstart + loc_perm[i];
        for(idx_t f=0; f < nfactors; ++f) {
          vbuf[f + (i*nfactors)] = full_factor->vals[f+(gi*nfactors)];
        }
      }

      /* send to rank p */
      MPI_Send(vbuf, nrows * nfactors, SPLATT_MPI_VAL, p, 3, rinfo->comm_3d);
    }

    mat_free(full_factor);
    splatt_free(loc_perm);
    splatt_free(vbuf);

  /* other ranks just send/recv */
  } else {
    /* send permutation info to root */
    MPI_Send(&(rinfo->layer_starts[mode]), 1, SPLATT_MPI_IDX, 0, 0,
        rinfo->comm_3d);
    MPI_Send(&localdim, 1, SPLATT_MPI_IDX, 0, 1, rinfo->comm_3d);
    MPI_Send(perm->iperms[mode] + rinfo->mat_start[mode], localdim,
        SPLATT_MPI_IDX, 0, 2, rinfo->comm_3d);

    /* receive factor */
    MPI_Recv(mymat->vals, mymat->I * mymat->J, SPLATT_MPI_VAL, 0, 3,
        rinfo->comm_3d, &status);
  }

  return mymat;
}
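/*
 * Hedged usage sketch: seeding one factor matrix per tensor mode with
 * mpi_mat_rand().  Only the mpi_mat_rand() signature is taken from the
 * function above; the nmodes count and the mats[] output array are
 * illustrative assumptions about the caller.  Note that every rank in
 * rinfo->comm_3d must make the call, because each rank participates in the
 * MPI_Reduce and in the point-to-point exchange with rank 0.
 */
static void seed_factors_sketch(
    idx_t const nmodes,              /* assumed: number of tensor modes */
    idx_t const nfactors,
    permutation_t const * const perm,
    rank_info * const rinfo,
    matrix_t * mats[])               /* assumed: output array, one entry per mode */
{
  for(idx_t m = 0; m < nmodes; ++m) {
    mats[m] = mpi_mat_rand(m, nfactors, perm, rinfo);
  }
}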
int main(int argc, char **argv) {
    srand(time(NULL));
    Cache cache;
    double max_runtime;
    /* Overly slow ones commented out by default. */
    MatMul mat_mul_funcs[] = {
        /*mat_mul_cpu,*/
        mat_mul_cpu_trans,
        mat_mul_cpu_trans_vec,
        mat_mul_cpu_block,
        mat_mul_cpu_cblas,
        /*mat_mul_cl,*/
        mat_mul_cl_row_priv,
        mat_mul_cl_row_local,
        mat_mul_cl_row_priv_col_local,
        mat_mul_cl_row_priv_cols_local,
        /* TODO broken for larger matrices, some cells contain trash.
         * Likely some memory overflow problem. */
        /*mat_mul_cl_block,*/
        mat_mul_cl_clblas,
    };
    int first, func_done[NELEMS(mat_mul_funcs)] = {0};
    size_t f, i;
    size_t mat_sizeof;

    /* CLI args. */
    if (argc > 1) {
        max_runtime = strtod(argv[1], NULL);
    } else {
        max_runtime = 1.0;
    }

    common_init(&(cache.common), NULL);

    /* Unit test 2x2. */
    {
        const F A[] = {
            1.0, 2.0,
            3.0, 4.0
        };
        const F B[] = {
            5.0, 6.0,
            7.0, 8.0
        };
        enum N { n = 2 };
        F C[n*n];
        const F C_ref[] = {
            19.0, 22.0,
            43.0, 50.0
        };
        cl_buf_init(&cache, n * n * sizeof(F));
        for (f = 0; f < sizeof(mat_mul_funcs)/sizeof(mat_mul_funcs[0]); ++f) {
            mat_zero(C, n);
            mat_mul_funcs[f](A, B, C, n, &cache);
            mat_assert_eq(C, C_ref, n);
        }
        cl_buf_deinit(&cache);
    }

    /* Unit test 4x4. */
    {
        const F A[] = {
             1.0,  2.0,  3.0,  4.0,
             5.0,  6.0,  7.0,  8.0,
             9.0, 10.0, 11.0, 12.0,
            13.0, 14.0, 15.0, 16.0,
        };
        const F B[] = {
            17.0, 18.0, 19.0, 20.0,
            21.0, 22.0, 23.0, 24.0,
            25.0, 26.0, 27.0, 28.0,
            29.0, 30.0, 31.0, 32.0,
        };
        const F C_ref[] = {
             250.0,  260.0,  270.0,  280.0,
             618.0,  644.0,  670.0,  696.0,
             986.0, 1028.0, 1070.0, 1112.0,
            1354.0, 1412.0, 1470.0, 1528.0,
        };
        enum N { n = 4 };
        F C[n*n];
        cl_buf_init(&cache, n * n * sizeof(F));
        for (f = 0; f < NELEMS(mat_mul_funcs); ++f) {
            mat_zero(C, n);
            mat_mul_funcs[f](A, B, C, n, &cache);
            mat_assert_eq(C, C_ref, n);
        }
        cl_buf_deinit(&cache);
    }

    /* Benchmarks. */
    {
        double dt;
        F *A = NULL, *B = NULL, *C = NULL, *C_ref = NULL, *dst = NULL, *ref = NULL;
        int done;
        size_t n = 2;

        puts("#matmul");
        done = 0;
        while (1) {
            printf("%zu ", (size_t)log2(n));
            mat_sizeof = n * n * sizeof(F);

            /* CPU setup. */
            A = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            B = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            C = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            C_ref = aligned_alloc(VECTOR_SIZEOF, mat_sizeof);
            if (NULL == A || NULL == B || NULL == C || NULL == C_ref) {
                printf("error: could not allocate memory for n = %zu\n", n);
                break;
            }
            mat_rand(A, n);
            mat_rand(B, n);
            cl_buf_init(&cache, mat_sizeof);

            /* The first still-active function produces C_ref; the rest are
             * checked against it. */
            first = 1;
            for (f = 0; f < NELEMS(mat_mul_funcs); ++f) {
                if (func_done[f]) {
                    printf("%*s", 10, "");
                } else {
                    if (first) {
                        dst = C_ref;
                        ref = NULL;
                        first = 0;
                    } else {
                        dst = C;
                        ref = C_ref;
                    }
                    dt = bench(mat_mul_funcs[f], A, B, dst, ref, n, &cache);
                    /* Drop a function from future sizes once it gets too slow. */
                    if (dt > max_runtime)
                        func_done[f] = 1;
                }
            }
            puts("");

            done = 1;
            for (i = 0; i < NELEMS(mat_mul_funcs); ++i) {
                if (!func_done[i]) {
                    done = 0;
                    break;
                }
            }
            if (done)
                break;
            n *= 2;

            /* CPU deinit. */
            free(A);
            free(B);
            free(C);
            free(C_ref);
            cl_buf_deinit(&cache);
        }
        common_deinit(&cache.common);
    }
    return EXIT_SUCCESS;
}
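/*
 * Hedged sketch of the timing contract the benchmark loop above assumes:
 * MatMul is a pointer to a kernel taking (A, B, C, n, cache), and bench()
 * zeroes the destination, times one call, optionally verifies it against the
 * reference result, prints the elapsed time, and returns it in seconds.  The
 * real harness may differ; the typedef and helper below are an illustration,
 * not the project's implementation (hence the *_sketch names).
 */
#include <stdio.h>
#include <time.h>

typedef void (*MatMulSketch)(const F *A, const F *B, F *C, size_t n, Cache *cache);

static double bench_sketch(MatMulSketch f, const F *A, const F *B, F *dst,
        const F *ref, size_t n, Cache *cache) {
    struct timespec t0, t1;
    double dt;

    mat_zero(dst, n);
    clock_gettime(CLOCK_MONOTONIC, &t0);
    f(A, B, dst, n, cache);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;

    /* When a reference result is supplied, check the kernel against it. */
    if (ref != NULL)
        mat_assert_eq(dst, ref, n);
    printf("%.6f ", dt);
    return dt;
}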