int least_squares(int m, int n, double **matrix) { if(m < 1 || n < 1 || n > m) return LS_DIMENSION_ERROR; int i, j; int info, lwork = m*m; double *work; if(!allocate_double_vector(&work, lwork)) { return LS_MEMORY_ERROR; } char transa = 'N'; double **a_matrix; if(!allocate_double_matrix(&a_matrix, n, m)) { return LS_MEMORY_ERROR; } for(i = 0; i < m; i ++) for(j = 0; j < n; j ++) a_matrix[j][i] = matrix[j][i]; double **b_matrix; if(!allocate_double_matrix(&b_matrix, m, m)) { return LS_MEMORY_ERROR; } for(i = 0; i < m; i ++) for(j = 0; j < m; j ++) b_matrix[i][j] = (i == j); dgels_(&transa, &m, &n, &m, a_matrix[0], &m, b_matrix[0], &m, work, &lwork, &info); for(i = 0; i < n; i ++) for(j = 0; j < m; j ++) matrix[i][j] = b_matrix[j][i]; free_matrix((void**)a_matrix); free_matrix((void**)b_matrix); free_vector(work); return LS_SUCCESS; }
double frobenius_check(double *known, double *computed, int m, int n, int id, int np) { int l_num_elements = m*n / np; if (computed == NULL) { return euclidean_norm(known, m*n, id, np); } double *difference = allocate_double_vector(l_num_elements); for (int i = 0; i < l_num_elements; i++) { difference[i] = computed[i] - known[i]; } double e_norm = euclidean_norm(difference, m*n, id, np); free(difference); return e_norm; }
void parallel_blas3_product(double *A, double *B, double *C, int m, int k, int n, int id, int np) { if (k % np != 0) { if (id == 0) fprintf(stderr, "k is not divisible by np.\n"); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Status status; int l_k = k / np; double *l_A = allocate_double_vector(l_k * m); double *l_B = allocate_double_vector(l_k * k); MPI_Datatype block_col_t; MPI_Datatype block_row_t; // for blocks in B = k x n MPI_Type_vector( n, // count = number of blocks, i.e. length of column * l_k(num rows) l_k, // blocklen = number of things in each block k, // stride = difference between start of blocks MPI_DOUBLE, // old datatype &block_row_t // new datatype ); MPI_Type_commit(&block_row_t); // for column of A= m x k MPI_Type_contiguous( m * l_k, // count = number of items MPI_DOUBLE, // old_type = type of items &block_col_t // new_mpi_type = the new datatype ); MPI_Type_commit(&block_col_t); if (id == 0) { // copy correct elements from A to l_A memcpy(l_A, A, sizeof(double) * l_k * m); for (int i = 1; i < np; ++i) { MPI_Send(&(A[0 + m*(i*l_k)]), 1, block_col_t, i, 0, MPI_COMM_WORLD); } } else { MPI_Recv(l_A, (m*l_k), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status); } if (id == 0) { // copy numbers from B to l_B for (int col = 0; col < n; ++col) { for (int row = 0; row < l_k; ++row) { l_B[row + l_k*col] = B[row + k*col]; } } for (int i = 1; i < np; ++i) { MPI_Send(&(B[i*l_k]), 1, block_row_t, i, 0, MPI_COMM_WORLD); } } else { MPI_Recv(l_B, (l_k*n), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status); } /* //debugging only for (int i = 0; i < l_k*n; ++i) { printf("[%i]: Row l_B[%i]=%f\n", id, i, l_B[i]); } for (int i = 0; i < l_k*m; ++i) { printf("[%i]: Col l_A[%i]=%f\n", id, i, l_A[i]); } */ // C only matters on process 0 and should be allocated outside this function double *local_C = allocate_double_vector(m*n); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m, n, l_k, 1, l_A, m, l_B, l_k, 0, local_C, m); MPI_Reduce(local_C, C, m*n, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); free(local_C); free(l_A); free(l_B); }
int main(int argc, char **argv) { int i, j, id, np, processor_name_len; int maxit, idleft, idright, flag, iter; char processor_name[MPI_MAX_PROCESSOR_NAME]; int dimensions, N, l_N, n, l_n; double *l_r, *l_u, *l_p, *l_q; double *x, *y, *gl, *gr; double h, tol, relres; MPI_Init(&argc, &argv); /* Check processes: */ MPI_Comm_size(MPI_COMM_WORLD, &np); MPI_Comm_rank(MPI_COMM_WORLD, &id); MPI_Get_processor_name(processor_name, &processor_name_len); /* process command-line inputs: */ if (argc != 3) { if (id == 0) { printf("Usage: \"./poisson N dim\" \n"); } MPI_Abort(MPI_COMM_WORLD, 1); } N = atoi(argv[1]); dimensions = atoi(argv[2]); if (dimensions == 2) { n = N*N; } else if (dimensions == 3) { n = N*N*N; } else { printf("Error: dimensions must equal 2 or 3"); MPI_Abort(MPI_COMM_WORLD, 1); } /* number of processes np must divide n: */ if ((n % np) != 0) { if (id == 0) { printf("Error: np must divide n!\n"); printf(" n = %d, np = %d, n%%np = %d\n", n, np, (n%np)); } MPI_Abort(MPI_COMM_WORLD, 1); } /* calculate size of local blocks: */ l_N = N / np; l_n = n / np; if (id == 0) { printf("n = %d, np = %d, l_n = %d\n", n, np, l_n); printf("\n"); fflush(stdout); } /* vectors l_u, l_r, l_p, l_q should be of length l_n and gl and gr should be of length l_n / l_N */ x = allocate_double_vector(N); /* x is a N length vector*/ y = allocate_double_vector(N); /* y is a N length vector*/ gl = allocate_double_vector(l_n / l_N); gr = allocate_double_vector(l_n / l_N); l_u = allocate_double_vector(l_n); l_r = allocate_double_vector(l_n); l_p = allocate_double_vector(l_n); l_q = allocate_double_vector(l_n); double start_time, end_time; /*Beginning of cg method: follows matlab code*/ h = 1.0 / (N + 1.0); for (i = 1; i <= N; i++) { x[i - 1] = i*h; y[i - 1] = i*h; } /*Setup B*/ setupB(l_r, x, y, l_N, N, h, id); tol = 1.0e-6; maxit = 99999; if (id>0) { idleft = id - 1; } else { idleft = MPI_PROC_NULL; } if (id<np - 1) { idright = id + 1; } else { idright = MPI_PROC_NULL; } MPI_Barrier(MPI_COMM_WORLD); start_time = MPI_Wtime(); cg(l_u, &flag, &relres, &iter, /*output*/ l_r, tol, maxit, /*input*/ l_p, l_q, l_n, l_N, N, id, idleft, idright, np, MPI_COMM_WORLD, gl, gr); MPI_Barrier(MPI_COMM_WORLD); end_time = MPI_Wtime(); double l_diff_norm = fd_norm(l_u, x, y, l_N, N, h, id); double diff_norm; MPI_Reduce(&l_diff_norm, &diff_norm, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); /* double *full; if (id == 0) full = allocate_double_vector(n); MPI_Gather(l_u, l_n, MPI_DOUBLE, full, l_n, MPI_DOUBLE, 0, MPI_COMM_WORLD); if (id == 0) { for (int qrx = 0; qrx < n; qrx++) printf("u[%i]=%f\n", qrx, full[qrx]); free_vector(full); } */ if (id == 0) { printf("%15s %15s %22s %15s %22s\n", "N", "DOF", "relres", "iter", "time"); printf("%15d %15.f %22.16e %15d %22.16e\n", N, (double)N*N, relres, iter, (end_time - start_time)); printf("||u-u_h||=%22.16e\n", diff_norm); /* printf("N: %d\n", N); printf("DOF: %d\n", N*(double)N); printf("relres: %22.16e\n", relres); printf("iter: %d\n", iter); printf("elapsed time: %22.16e\n", (end_time - start_time)); */ } free_vector(x); free_vector(y); free_vector(l_r); free_vector(gl); free_vector(gr); free_vector(l_u); free_vector(l_p); free_vector(l_q); MPI_Finalize(); return 0; }
int constrained_least_squares(int m, int n, double **matrix, int c, int *constrained) { //check problem dimensions if(m < 1 || n < 1 || n > m || c > n) return LS_DIMENSION_ERROR; //counters int i, j; //extra problem dimensions int f = m - c, u = n - c; //lapack and blas inputs char transa, transb; double alpha, beta; //lapack output int info; //lapack workspace int lwork = m*m; double *work; if(!allocate_double_vector(&work, lwork)) { return LS_MEMORY_ERROR; } //lapack LU pivot indices int *ipiv; if(!allocate_integer_vector(&ipiv,c)) { return LS_MEMORY_ERROR; } //lapack coefficients of QR elementary reflectors double *tau; if(!allocate_double_vector(&tau,c)) { return LS_MEMORY_ERROR; } //matrices used double **t_matrix; if(!allocate_double_matrix(&t_matrix, m, m)) { return LS_MEMORY_ERROR; } double **c_matrix; if(!allocate_double_matrix(&c_matrix, n, n)) { return LS_MEMORY_ERROR; } double **r_matrix; if(!allocate_double_matrix(&r_matrix, c, c)) { return LS_MEMORY_ERROR; } double **a_matrix; if(!allocate_double_matrix(&a_matrix, n, f)) { return LS_MEMORY_ERROR; } double **d_matrix; if(!allocate_double_matrix(&d_matrix, f, f)) { return LS_MEMORY_ERROR; } //indices of unconstrained equations int *temp, *unconstrained; if(!allocate_integer_vector(&temp,m)) { return LS_MEMORY_ERROR; } if(!allocate_integer_vector(&unconstrained,f)) { return LS_MEMORY_ERROR; } //create vector of unconstrained indices for(i = 0; i < m; i ++) temp[i] = 0; for(i = 0; i < c; i ++) temp[constrained[i]] = 1; j = 0; for(i = 0; i < m; i ++) if(!temp[i]) unconstrained[j++] = i; //copy unconstrained equations from input matrix -> t_matrix for(i = 0; i < f; i ++) for(j = 0; j < n; j ++) t_matrix[i][j] = matrix[j][unconstrained[i]]; //copy constrained equations from input matrix -> c_matrix for(i = 0; i < c; i ++) for(j = 0; j < n; j ++) c_matrix[i][j] = matrix[j][constrained[i]]; //QR decomposition of the transposed constrained equations -> c_matrix dgeqrf_(&n, &c, c_matrix[0], &n, tau, work, &lwork, &info); //copy R out of the above QR decomposition -> r_matrix for(i = 0; i < c; i ++) for(j = 0; j < c; j ++) r_matrix[i][j] = ((j >= i) ? c_matrix[j][i] : 0); //form the square matrix Q from the above QR decomposition -> c_matrix' dorgqr_(&n, &n, &c, c_matrix[0], &n, tau, work, &lwork, &info); //multiply unconstrained eqations by Q -> a_matrix' transa = 'T'; transb = 'N'; alpha = 1.0; beta = 0.0; dgemm_(&transa, &transb, &f, &n, &n, &alpha, t_matrix[0], &m, c_matrix[0], &n, &beta, a_matrix[0], &f); //invert R' of the above QR decomposition -> r_matrix dgetrf_(&c, &c, r_matrix[0], &c, ipiv, &info); dgetri_(&c, r_matrix[0], &c, ipiv, work, &lwork, &info); //LS inversion of the non-square parts from unconstrained * Q -> d_matrix' for(i = 0; i < f; i ++) for(j = 0; j < u; j ++) t_matrix[j][i] = a_matrix[j+c][i]; for(i = 0; i < f; i ++) for(j = 0; j < f; j ++) d_matrix[i][j] = (i == j); transa = 'N'; dgels_(&transa, &f, &u, &f, t_matrix[0], &m, d_matrix[0], &f, work, &lwork, &info); //multiply matrices together to form the CLS solution -> t_matrix' transa = transb = 'N'; alpha = 1.0; beta = 0.0; dgemm_(&transa, &transb, &n, &f, &u, &alpha, c_matrix[c], &n, d_matrix[0], &f, &beta, t_matrix[0], &m); alpha = -1.0; beta = 1.0; dgemm_(&transa, &transb, &n, &c, &f, &alpha, t_matrix[0], &m, a_matrix[0], &f, &beta, c_matrix[0], &n); alpha = 1.0; beta = 0.0; dgemm_(&transa, &transb, &n, &c, &c, &alpha, c_matrix[0], &n, r_matrix[0], &c, &beta, t_matrix[f], &m); //copy the result out of the temporary matrix -> matrix for(i = 0; i < n; i ++) for(j = 0; j < f; j ++) matrix[i][unconstrained[j]] = t_matrix[j][i]; for(i = 0; i < n; i ++) for(j = 0; j < c; j ++) matrix[i][constrained[j]] = t_matrix[j+f][i]; //clean up and return successful free_vector(work); free_vector(ipiv); free_vector(tau); free_vector(temp); free_vector(unconstrained); free_matrix((void **)t_matrix); free_matrix((void **)c_matrix); free_matrix((void **)r_matrix); free_matrix((void **)a_matrix); free_matrix((void **)d_matrix); return LS_SUCCESS; }
void calculate_cell_reconstruction_matrices(int n_variables, double *weight_exponent, int *maximum_order, struct FACE *face, int n_cells, struct CELL *cell, struct ZONE *zone) { int c, u, i, j, k, l; int order, n_powers, n_stencil; //find the overall maximum order int maximum_maximum_order = 0; for(u = 0; u < n_variables; u ++) if(maximum_order[u] > maximum_maximum_order) maximum_maximum_order = maximum_order[u]; //cell structure allocation for(c = 0; c < n_cells; c ++) exit_if_false(cell_matrix_new(n_variables, &cell[c]),"allocating cell matrices"); //numerics values double **matrix, *weight; int n_constraints, *constraint; exit_if_false(allocate_double_matrix(&matrix,ORDER_TO_POWERS(maximum_maximum_order),MAX_STENCIL),"allocating matrix"); exit_if_false(allocate_integer_vector(&constraint,MAX_STENCIL),"allocating constraints"); exit_if_false(allocate_double_vector(&weight,MAX_STENCIL),"allocating weights"); //stencil element properties int s_id, s_index; struct ZONE *s_zone; char s_location, *s_condition; double s_area, *s_centroid, s_weight; //integration double x[2]; int differential[2], d; //CV polygon int n_polygon; double ***polygon; exit_if_false(allocate_double_pointer_matrix(&polygon,MAX(MAX_CELL_FACES,4),2),"allocating polygon memory"); for(c = 0; c < n_cells; c ++) { for(u = 0; u < n_variables; u ++) { //problem size order = cell[c].order[u]; n_powers = ORDER_TO_POWERS(order); n_stencil = cell[c].n_stencil[u]; n_constraints = 0; for(i = 0; i < n_stencil; i ++) { //stencil element properties s_id = cell[c].stencil[u][i]; s_index = ID_TO_INDEX(s_id); s_zone = &zone[ID_TO_ZONE(s_id)]; s_location = s_zone->location; s_condition = s_zone->condition; if(s_location == 'f') { s_centroid = face[s_index].centroid; s_area = face[s_index].area; } else if(s_location == 'c') { s_centroid = cell[s_index].centroid; s_area = cell[s_index].area; } else exit_if_false(0,"recognising zone location"); s_weight = (s_centroid[0] - cell[c].centroid[0])*(s_centroid[0] - cell[c].centroid[0]); s_weight += (s_centroid[1] - cell[c].centroid[1])*(s_centroid[1] - cell[c].centroid[1]); s_weight = 1.0/pow(s_weight,0.5*weight_exponent[u]); if(s_location == 'c' && s_index == c) s_weight = 1.0; weight[i] = s_weight; //unknown and dirichlet conditions have zero differentiation differential[0] = differential[1] = 0; //other conditions have differential determined from numbers of x and y-s in the condition string if(s_condition[0] != 'u' && s_condition[0] != 'd') { j = 0; while(s_condition[j] != '\0') { differential[0] += (s_condition[j] == 'x'); differential[1] += (s_condition[j] == 'y'); j ++; } } //index for the determined differential d = differential_index[differential[0]][differential[1]]; //unknowns if(s_condition[0] == 'u') { /*//fit unknowns to centroid points x[0] = s_centroid[0] - cell[c].centroid[0]; x[1] = s_centroid[1] - cell[c].centroid[1]; for(j = 0; j < n_powers; j ++) { matrix[j][i] = polynomial_coefficient[d][j]* integer_power(x[0],polynomial_power_x[d][j])* integer_power(x[1],polynomial_power_y[d][j])* s_weight; }*/ //fit unknowns to CV average n_polygon = generate_control_volume_polygon(polygon, s_index, s_location, face, cell); for(j = 0; j < n_powers; j ++) matrix[j][i] = 0.0; for(j = 0; j <= order; j ++) { for(k = 0; k < n_polygon; k ++) { x[0] = 0.5*polygon[k][0][0]*(1.0 - gauss_x[order][j]) + 0.5*polygon[k][1][0]*(1.0 + gauss_x[order][j]) - cell[c].centroid[0]; x[1] = 0.5*polygon[k][0][1]*(1.0 - gauss_x[order][j]) + 0.5*polygon[k][1][1]*(1.0 + gauss_x[order][j]) - cell[c].centroid[1]; for(l = 0; l < n_powers; l ++) { //[face integral of polynomial integrated wrt x] * [x normal] / [CV area] matrix[l][i] += polynomial_coefficient[d][l] * (1.0 / (polynomial_power_x[d][l] + 1.0)) * integer_power(x[0],polynomial_power_x[d][l]+1) * integer_power(x[1],polynomial_power_y[d][l]) * s_weight * gauss_w[order][j] * 0.5 * (polygon[k][1][1] - polygon[k][0][1]) / s_area; } } } } //knowns else { //known faces fit to face average if(s_location == 'f') { for(j = 0; j < n_powers; j ++) matrix[j][i] = 0.0; for(j = 0; j < order; j ++) { x[0] = 0.5*face[s_index].node[0]->x[0]*(1.0 - gauss_x[order-1][j]) + 0.5*face[s_index].node[1]->x[0]*(1.0 + gauss_x[order-1][j]) - cell[c].centroid[0]; x[1] = 0.5*face[s_index].node[0]->x[1]*(1.0 - gauss_x[order-1][j]) + 0.5*face[s_index].node[1]->x[1]*(1.0 + gauss_x[order-1][j]) - cell[c].centroid[1]; for(k = 0; k < n_powers; k ++) { matrix[k][i] += polynomial_coefficient[d][k] * integer_power(x[0],polynomial_power_x[d][k]) * integer_power(x[1],polynomial_power_y[d][k]) * s_weight*gauss_w[order-1][j]*0.5; } } } //cells need implementing //if(s_location == 'c') //{ //} } //constraints are the centre cell and any dirichlet boundaries if((s_location == 'c' && s_index == c) || s_condition[0] == 'd') constraint[n_constraints++] = i; } //solve if(n_constraints > 0) exit_if_false(constrained_least_squares(n_stencil,n_powers,matrix,n_constraints,constraint) == LS_SUCCESS, "doing CLS calculation"); else exit_if_false(least_squares(n_stencil,n_powers,matrix) == LS_SUCCESS,"doing LS calculation"); //multiply by the weights for(i = 0; i < n_powers; i ++) for(j = 0; j < n_stencil; j ++) matrix[i][j] *= weight[j]; //store in the cell structure for(i = 0; i < n_powers; i ++) for(j = 0; j < n_stencil; j ++) cell[c].matrix[u][i][j] = matrix[i][j]; } } //clean up free_matrix((void**)matrix); free_vector(constraint); free_vector(weight); free_matrix((void**)polygon); }