/*
 * Jacobi eigenvalue driver.
 *
 * Reads n3 square float matrices (n1 == n2 == n) from the "in" file, runs
 * niter sweeps of jacobi() over every (j,k) pair with j < k, and writes the
 * n diagonal entries of each processed matrix to "out".  When the eig=
 * parameter is given, the n*n matrix v is written to that file as well.
 *
 * Parameters (command line):
 *   niter  number of sweeps per matrix (default 10)
 *   eig    optional output file for v
 *
 * Terminates through exit(0); sf_error() is called on invalid input.
 */
int main(int argc, char* argv[])
{
    int j, k, n, n2, i3, n3, iter, niter;
    float **a, *e, **v, s2;
    sf_file mat, val, eig;

    sf_init(argc,argv);
    mat = sf_input("in");
    val = sf_output("out");

    if (SF_FLOAT != sf_gettype(mat)) sf_error("Need float input");
    if (!sf_histint(mat,"n1",&n)) sf_error("No n1= in input");
    if (!sf_histint(mat,"n2",&n2) || n2 != n) sf_error("Need n1=n2 in input");
    n3 = sf_leftsize(mat,2);

    /* one row of n values is written per input matrix */
    sf_putint(val,"n2",1);

    if (!sf_getint("niter",&niter)) niter=10;
    /* number of sweeps over all (j,k) pairs */

    a = sf_floatalloc2(n,n);
    e = sf_floatalloc(n);

    if (NULL != sf_getstring("eig")) {
        eig = sf_output("eig"); /* eigenvectors */
        v = sf_floatalloc2(n,n);
    } else {
        eig = NULL;
        v = NULL;
    }

    jacobi_init(n);

    for (i3=0; i3 < n3; i3++) {
        sf_floatread(a[0],n*n,mat);

        /*
         * v is handed to every jacobi() call and is presumably updated in
         * place (confirm against jacobi()), so it must start from the
         * identity for EACH matrix.  It used to be initialized only once,
         * before this loop, which let the result of matrix i3-1 leak into
         * the output for matrix i3 whenever n3 > 1.
         */
        if (NULL != v) {
            for (j=0; j < n; j++) {
                for (k=0; k < n; k++) {
                    v[j][k] = (j==k)? 1.0:0.0;
                }
            }
        }

        for (iter=0; iter < niter; iter++) {
            /* s2 accumulates the values returned by jacobi() in one sweep */
            s2 = 0.;
            for (j=0; j < n-1; j++) {
                for (k=j+1; k < n; k++) {
                    s2 += jacobi(a,j,k,v);
                }
            }
            sf_warning("iter=%d s2=%g",iter+1,s2);
        }

        /* the diagonal of the processed matrix is the output */
        for (j=0; j < n; j++) {
            e[j]=a[j][j];
        }

        sf_floatwrite(e,n, val);
        if (NULL != v) sf_floatwrite(v[0],n*n, eig);
    }

    exit(0);
}
// Jacobi solver kernels void run_jacobi_init( Chunk* chunk, Settings* settings, double rx, double ry) { START_PROFILING(settings->kernel_profile); jacobi_init( chunk->x, chunk->y, settings->halo_depth, settings->coefficient, rx, ry, chunk->density, chunk->energy, chunk->u0, chunk->u, chunk->kx, chunk->ky); STOP_PROFILING(settings->kernel_profile, __func__); }
void jacobi( size_t n , size_t iterations, size_t block_size, std::string output_filename) { hpx::util::high_resolution_timer t; vector< vector< vector< shared_future<block> > > > blockList(2); jacobi_init(blockList, n, block_size); size_t numBlocks = blockList[0].size(); for(size_t i = 1; i < iterations; ++i) { const size_t prev = i%2; const size_t curr = (i+1)%2; blockList[curr][0][0] = dataflow( jacobi_BL, blockList[prev][0][0], blockList[prev][0][1], blockList[prev][1][0] ); for(size_t j = 1; j < numBlocks - 1; j++) { blockList[curr][j][0] = dataflow( jacobi_left, blockList[prev][j ][0], blockList[prev][j ][1], blockList[prev][j-1][0], blockList[prev][j+1][0] ); } blockList[curr][numBlocks-1][0] = dataflow( jacobi_TL, blockList[prev][numBlocks-1][0], blockList[prev][numBlocks-1][1], blockList[prev][numBlocks-2][0] ); for(size_t j = 1; j < numBlocks - 1; j++) { blockList[curr][0][j] = dataflow( jacobi_bot, blockList[prev][0][j ], blockList[prev][0][j-1], blockList[prev][0][j+1], blockList[prev][1][j ] ); for(size_t k = 1; k < numBlocks - 1; k++) { blockList[curr][j][k] = dataflow( jacobi_op, blockList[prev][k ][j ], blockList[prev][k ][j-1], blockList[prev][k ][j+1], blockList[prev][k-1][j ], blockList[prev][k+1][j ]); } blockList[curr][numBlocks-1][j] = dataflow( jacobi_top, blockList[prev][numBlocks-1][j ], blockList[prev][numBlocks-1][j-1], blockList[prev][numBlocks-1][j+1], blockList[prev][numBlocks-2][j ] ); } blockList[curr][0][numBlocks-1] = dataflow( jacobi_BR, blockList[prev][0][numBlocks-1], blockList[prev][0][numBlocks-2], blockList[prev][1][numBlocks-1]); for(size_t j = 1; j < numBlocks - 1; j++) { blockList[curr][j][numBlocks-1] = dataflow( jacobi_left, blockList[prev][j ][numBlocks-1], blockList[prev][j ][numBlocks-2], blockList[prev][j-1][numBlocks-1], blockList[prev][j+1][numBlocks-1]); } blockList[curr][numBlocks-1][numBlocks-1] = dataflow( jacobi_TR, blockList[prev][numBlocks-1][numBlocks-1], blockList[prev][numBlocks-1][numBlocks-2], 
blockList[prev][numBlocks-2][numBlocks-1]); } for(int i = 0; i < blockList[(n-1)%2].size(); i++) { hpx::wait_all(blockList[(n-1)%2][i]); } report_timing(n, iterations, t.elapsed()); //output_grid(output_filename, *grid_old, n); }
int main(int argc, char** argv) { max_file_t *max_file = jacobi_init(); size_t dim = 64; // this should be a scalar input in the bitstream size_t MAX_ITER = 20; size_t C = max_get_constant_uint64t(max_file, "C"); size_t blks = 100; size_t total_equations = blks*C; clock_t engine_start = 0; clock_t engine_end = 0; double engine_total_time = 0.0; size_t max_dim = max_get_constant_uint64t(max_file, "maxDimLen"); if(argc == 1) { fprintf(stderr, "====>Info:Runing Jacobi with default parameter values:[Dimension = %ld, Iteration = %ld, blocks = %ld(%ld*%ld equations)], for details, see the README.txt\n", dim, MAX_ITER, blks, blks, C); } char *opt_str = "hd:b:i:"; int opt = 0; int input_dim = dim; int input_iter = MAX_ITER; int input_blks = blks; while( (opt = getopt(argc, argv, opt_str)) != -1) { switch(opt) { case 'd': input_dim = atoi(optarg); break; case 'b': input_blks = atoi(optarg); break; case 'i': input_iter = atoi(optarg); break; case 'h': usage(); return 1; default: fprintf(stderr, "====>Error: Inputs contain invalid command line paramter(s)!\n"); usage(); return 1; } } max_file_free(max_file); if(input_dim <= 0 || input_dim > max_dim || input_dim % 2 != 0) { fprintf(stderr, "\n====>Error: Input dimension length is invalid, for details, see the usage below:\n"); usage(); return 1; } else { dim = (size_t)input_dim; } if(input_blks <= 0) { fprintf(stderr, "\n====>Error: Input block number is invalid, should bigger than zero.\n"); usage(); return 1; } else { blks = (size_t)input_blks; } if(input_iter <= 1) { fprintf(stderr, "\n====>Error: Input iteration number is invalid, should bigger than 1.\n"); usage(); return 1; } else { MAX_ITER = (size_t)input_iter; } total_equations = blks * C; double *A = malloc(dim*dim*sizeof(double)); double *A_trans = malloc(dim*dim*sizeof(double)); double *b = malloc(total_equations*dim*sizeof(double)); double *b_trans = malloc(total_equations*dim*sizeof(double)); double *diagA = malloc(dim*sizeof(double)); double *reverse_diagA = 
malloc(dim*sizeof(double)); double *x_init = malloc(C*dim*sizeof(double)); double *x_trans_init = malloc(C*dim*sizeof(double)); double *result = malloc(total_equations * dim * sizeof(double)); double *reorder_result = malloc(total_equations * dim *sizeof(double)); double *solutions = malloc(total_equations * dim *sizeof(double)); double *error = malloc(total_equations*sizeof(double)); double *error_bak = malloc(total_equations*sizeof(double)); int *is_solution_valid = malloc(total_equations*sizeof(int)); int *recacu_error_index = malloc(total_equations*sizeof(int)); double *expected_error = malloc(total_equations*sizeof(double)); double *x_base = malloc(total_equations * dim * sizeof(double)); double *x_all_init = malloc(total_equations * dim *sizeof(double)); double *x_all_trans_init = malloc(total_equations * dim *sizeof(double)); memset(A, 0 , sizeof(double)*dim*dim); memset(A_trans, 0 , sizeof(double)*dim*dim); memset(b, 0 , sizeof(double)*dim*total_equations); memset(b_trans, 0 , sizeof(double)*dim*total_equations); memset(diagA, 0 , sizeof(double)*dim); memset(reverse_diagA, 0 , sizeof(double)*dim); memset(x_init, 0 , sizeof(double) *C*dim); memset(result, 0 , sizeof(double)*dim*total_equations); memset(reorder_result, 0 , sizeof(double)*dim*total_equations); memset(error, 0 , sizeof(double)*total_equations); memset(expected_error, 0 , sizeof(double)*total_equations); memset(x_base, 0 , sizeof(double)*dim*total_equations); memset(x_all_init, 0 , sizeof(double)*dim*total_equations); memset(x_all_trans_init, 0 , sizeof(double)*dim*total_equations); memset(is_solution_valid,0 , sizeof(int)*total_equations); for(int i = 0; i < total_equations; i ++) { recacu_error_index[i] = -1; expected_error[i] = 1000; error_bak[i] = 1000; for(int j = 0; j < dim; j ++) { solutions[i*dim + j] = 1000; } } /** * Generating random value for b and A */ srand(time(NULL)); for(int i = 0; i < dim; ++i) { double sum = 0; for(int j = 0; j < dim; ++j) { if(i != j) { A[i*dim+j] = 
2.0*rand()/(double)RAND_MAX - 1 ; // random number between -1 and 1 sum += fabs(A[i*dim+j]) ; } } A[i * dim + i] = 1 + sum; diagA[i] = 1.0/A[i * dim + i]; reverse_diagA[i] = A[i * dim + i]; } double A_original[dim * dim]; for(int i = 0; i < C*blks; i ++) { for(int j = 0; j < dim; j ++) { b[i * dim + j] = 2.0*rand()/(double)RAND_MAX - 1; } } for(int i = 0; i < dim; i ++) { for(int j = 0; j < dim; j ++) { A_original[i * dim + j] = A[i * dim + j]; if(i != j) { A[i * dim + j] = A[i*dim + j] * diagA[i]; } } } /** * Reorder the input A and b */ engine_start = clock(); for(int i = 0; i < dim; i ++) { for(int j = 0; j < dim; j ++) { A_trans[i * dim + j] = A[j * dim + i]; } } int count = 0; for(int yy = 0; yy < total_equations; yy += C) { for(int i = 0; i < dim; i ++) { for(int j = yy; j <yy + C; j ++) { b_trans[count] = b[j * dim + i]*diagA[i]; count ++; } } } for(int k = 0; k < blks; k ++) { for ( int i = 0; i < C ; i ++ ) { for ( int j = 0; j < dim; j ++ ) { x_init[i * dim + j] = 0; x_trans_init[j*C + i] = x_init[i * dim + j]; } } memcpy(x_all_trans_init + k * C * dim , x_trans_init , sizeof(double)*C*dim); memcpy(x_all_init + k * C * dim , x_init , sizeof(double)*C*dim); } jacobi( dim, total_equations, MAX_ITER, A_trans , dim * dim * sizeof(double) , b_trans , total_equations * dim * sizeof(double) , reverse_diagA , dim * sizeof(double) , x_all_trans_init , total_equations * dim * sizeof(double) , error , total_equations * sizeof(double) , result , total_equations * dim * sizeof(double) ); for(int yy = 0; yy<total_equations; yy += C) { for(int i = 0; i < C; i ++) { for(int j = 0; j < dim; j ++) { reorder_result[yy *dim + i*dim + j] = result[yy * dim + i + j * C]; } } } /*Check Error to decide whether we need to restream into kernel again*/ int recacu_cnt = 0; int new_recacu_cnt = 0; int actual_recacu_cnt = 0; int new_actual_recacu_cnt = 0; double *x_latest_init = malloc(total_equations * dim * sizeof(double)) ; double *x_latest_trans_init = malloc(total_equations * dim 
* sizeof(double)) ; double *recacu_b = malloc(total_equations * dim * sizeof(double)) ; double *recacu_trans_b = malloc(total_equations * dim * sizeof(double)) ; memset(x_latest_init , 0 , total_equations * dim * sizeof(double)) ; memset(x_latest_trans_init , 0 , total_equations * dim * sizeof(double)) ; memset(recacu_b , 0 , total_equations * dim * sizeof(double)) ; memset(recacu_trans_b , 0 , total_equations * dim * sizeof(double)) ; int idx = 0; for(int i = 0; i < total_equations; i ++) { if(error[i] > CUR_EPS) { memcpy(x_latest_init + idx*dim, reorder_result + i*dim, dim*sizeof(double)); memcpy(recacu_b + idx*dim, b + i*dim, dim*sizeof(double)); recacu_error_index[idx] = i; recacu_cnt ++ ; actual_recacu_cnt ++ ; idx ++; } else { error_bak[i] = error[i]; memcpy(solutions + i*dim, reorder_result + i*dim, dim*sizeof(double)); } } while( recacu_cnt % C ) { recacu_cnt ++; } /** * if recaculate count not zero, we start to restream data into kernel again */ int times = 1; while( recacu_cnt != 0 ) { /*Reorder Latest solutions init value */ times ++; memset(x_latest_trans_init, 0, recacu_cnt*dim*sizeof(double)); count = 0; for(int yy = 0; yy < recacu_cnt; yy += C) { for(int i = 0; i < dim; i ++) { for(int j = yy; j < yy + C; j ++) { x_latest_trans_init[count] = x_latest_init[j * dim + i]; count ++; } } } /*Reorder latest b*/ memset(recacu_trans_b, 0, total_equations*dim*sizeof(double)); count = 0; for(int yy = 0; yy < recacu_cnt; yy += C) { for(int i = 0; i < dim; i ++) { for(int j = yy; j < yy + C; j ++) { recacu_trans_b[count] = recacu_b[j * dim + i]*diagA[i]; count ++; } } } memset(error , 0 , recacu_cnt * sizeof(double ) ) ; memset(result , 0 , recacu_cnt * dim * sizeof(double ) ) ; jacobi( dim, recacu_cnt, MAX_ITER, A_trans , dim * dim * sizeof(double) , recacu_trans_b , recacu_cnt * dim * sizeof(double) , reverse_diagA , dim * sizeof(double) , x_latest_trans_init , recacu_cnt * dim * sizeof(double) , error , recacu_cnt * sizeof(double) , result , recacu_cnt * dim 
* sizeof(double) ); for(int yy = 0; yy < recacu_cnt; yy += C) { for(int i = 0; i < C; i ++) { for(int j = 0; j < dim; j ++) { reorder_result[yy *dim + i*dim + j] = result[yy * dim + i + j * C]; } } } new_recacu_cnt = 0; new_actual_recacu_cnt = 0; int idx2 = 0; for(int i = 0; i < actual_recacu_cnt; i ++) { if(error[i] > CUR_EPS) { memcpy(x_latest_init + new_recacu_cnt*dim, reorder_result + i*dim, dim*sizeof(double)); memcpy(recacu_b + new_recacu_cnt*dim, recacu_b + i*dim, dim*sizeof(double)); recacu_error_index[idx2] = recacu_error_index[i]; new_recacu_cnt ++; new_actual_recacu_cnt ++; idx2 ++; } else { error_bak[ recacu_error_index[i]] = error[i]; memcpy(solutions + recacu_error_index[i] *dim, reorder_result + i*dim, dim*sizeof(double)); } } /* padding to multipy of C */ while( new_recacu_cnt % C ) { new_recacu_cnt ++; } /* update the current recaculating solution numbers */ recacu_cnt = new_recacu_cnt; actual_recacu_cnt = new_actual_recacu_cnt; }//loop while engine_end = clock(); engine_total_time = (double)(engine_end - engine_start) / CLOCKS_PER_SEC; fprintf(stderr, "=========>Kernel Complete, Stream Times: %d\n", times); clock_t cpu_start = clock(); jacobi_opt(A_original, x_base, b, dim, C, total_equations, x_all_init , expected_error); clock_t cpu_end = clock(); double cpu_total_time = (double)(cpu_end - cpu_start) / CLOCKS_PER_SEC; /* Compare the result with the standard result */ int cnt = 0; int index = 0; for(int i = 0; i < total_equations; i ++) { for(int j = 0; j < dim; j ++) { double diff = solutions[i * dim + j] - x_base[i*dim + j]; if(fabs(diff) > EPS) { fprintf(stderr, "error: atual=%.10f, expect=%.10f, err=%.10e\n", solutions[i * dim + j], x_base[i*dim + j], diff); cnt ++; index ++; } } } if(cnt == 0) { max_print_result(dim, total_equations, MAX_ITER, engine_total_time, cpu_total_time); fprintf(stderr, "==========>All Test Passed\n\n"); } else { fprintf(stderr, "!!!Test Failed:%d\n\n", cnt); } free ( A ) ; free ( A_trans ) ; free ( b ) ; free ( 
b_trans ) ; free ( diagA ) ; free ( reverse_diagA ) ; free ( x_init ) ; free ( error ) ; free ( error_bak ) ; free ( recacu_error_index ) ; free ( expected_error ) ; free ( result ) ; free ( reorder_result ) ; free ( solutions ) ; free ( x_base ) ; free ( x_all_init ) ; free ( x_all_trans_init ) ; free ( x_latest_init ) ; free ( x_latest_trans_init ) ; free ( recacu_b ) ; int status = (cnt == 0) ? 0:1; return status; }