int main(int argc, char **argv) { // Serial code int a=0; int local_sense=1; int i,j; if(argc!=3) { printf("Usage: spin_sense <Number-of-OpenMP-threads> <Num-of-Barriers>\n"); exit(-1); } num_threads=atoi( argv[1] ); num_barriers=atoi(argv[2]); if (num_barriers > 1000) num_barriers = 1000; count=num_threads; double start[num_barriers][num_threads],end[num_barriers][num_threads],barrier_time[num_barriers],maxstart[num_barriers],maxend[num_barriers], avg_time = 0.0f; for(i=0; i<num_barriers; i++) { maxstart[i]=0.0f; maxend[i]=0.0f; barrier_time[i]=0.0f; } if(DEBUG == 1) printf("This is the serial section\n"); omp_set_num_threads(num_threads); #pragma omp parallel shared(a) private(i) firstprivate(local_sense) { // Now we're in the parallel section int thread_num = omp_get_thread_num(); for (i=0; i<num_barriers; i++) { #pragma omp critical { a++; } if(DEBUG == 1) printf("a=%d in thread %d before barrier # %d.\n",a,thread_num,i+1); start[i][thread_num] = omp_get_wtime(); sense_barrier(&local_sense,thread_num); end[i][thread_num] = omp_get_wtime(); if(DEBUG == 1) printf("a=%d in thread %d after barrier # %d.\n",a,thread_num,i+1); } } for(i=0;i<num_barriers;i++) { for(j=0;j<num_threads;j++) { if(start[i][j]>maxstart[i]) maxstart[i]=start[i][j]; if(end[i][j]>maxend[i]) maxend[i]=end[i][j]; } barrier_time[i]=maxend[i]-maxstart[i]; if(DEBUG == 1) printf("Barrier Time of barrier #%d = %lf\n",i+1,barrier_time[i]); } for(i=0;i<num_barriers;i++) { avg_time += barrier_time[i]; } avg_time = avg_time/num_barriers; printf("Barrier_Time=%lf\n",avg_time); // Resume serial code if(DEBUG == 1) printf("Back in the serial section again\n"); return 0; }
int main(int argc, char const *argv[]) { char* s; std::srand(std::time(0)); //use current time as seed for random generator int r = rand() % 1000; for(int i = 0; i < r; i++) { rand(); } if(argc < 3) { return 1; } int forestSize = strtol(argv[1], &s, 10); int iterations = strtol(argv[2], &s, 10); double SIDE = std::sqrt(forestSize); SIDE = fRand(std::sqrt(SIDE),std::sqrt(2)*SIDE); double R = 1; double begin, end; std::vector<int> empty; std::vector<Tree*> Forest; std::vector< std::vector<int> > neighbors(forestSize,empty); std::vector<double> metrics(forestSize,0.0); int num_threads; std::vector<int> systems_processed; // DEBUG std::vector<int> symbols_translated; // DEBUG ///// PARALLEL BLOCK begin = omp_get_wtime(); #pragma omp parallel shared(Forest,neighbors,metrics,num_threads) { #pragma omp master { // INIT VARIABLES std::vector<Point> positions; num_threads = omp_get_num_threads(); std::cout << "Running " << forestSize << " trees for " << iterations << " iterations on " << num_threads << " processors" << std::endl; for(int i = 0; i < forestSize; i++) { double x = fRand(0,SIDE); double y = fRand(0,SIDE); Point p = {x,y}; Tree *T = new MonopodialTree(); Forest.push_back(T); positions.push_back(p); for(int j = 0 ; j < i ; j++) { Point q = positions[j]; if(pointDistance(p,q) < R) { neighbors[j].push_back(i); neighbors[i].push_back(j); } } } systems_processed = std::vector<int>(num_threads,0); //DEBUG symbols_translated = std::vector<int>(num_threads,0); //DEBUG } #pragma omp barrier int thread_num = omp_get_thread_num(); // ITERATE for(int j = 0 ; j < iterations ; j++) { #pragma omp for schedule(dynamic) for(int i = 0; i < Forest.size() ; i++) { Forest[i]->next(); double metric = Forest[i]->calculateMetric(); metrics[i] = metric; systems_processed[thread_num]++; //DEBUG symbols_translated[thread_num] += Forest[i]->getState().size(); //DEBUG } #pragma omp for schedule(dynamic) for(int i = 0; i < Forest.size() ; i++) { Forest[i]->updateMetric(metrics,neighbors[i]); 
} } } ///// PARALLEL BLOCK end = omp_get_wtime(); std::vector< std::vector<int> > connected_components = get_connected_components(neighbors); // print_forest(Forest, neighbors, metrics); // VERBOSE // print_connected_components( connected_components); // VERBOSE char buffer[80]; FILE *f = fopen("Results_naive.txt", "a"); if(f != NULL) { fprintf(f, "%s\n", gettime(buffer)); fprintf(f,"%d threads\n",num_threads); fprintf(f,"%d trees\n",forestSize); fprintf(f,"%d iterations\n",iterations); fprintf(f,"%lf %lf\n",SIDE,R); for(int i = 0; i < connected_components.size(); i++) { fprintf(f, "%d ", connected_components[i].size()); } fprintf(f, "\n"); fprintf(f,"Proc Systems Symbols\n");//DEBUG for(int i = 0; i < num_threads; i++)//DEBUG {//DEBUG fprintf(f," %02d %03d %03d\n",i,systems_processed[i],symbols_translated[i]);//DEBUG }//DEBUG fprintf(f,"Time : %f seconds\n", end-begin); fprintf(f,"\n=====================\n"); } for(int i = 0; i < Forest.size() ; i++) { delete Forest[i]; } return 0; }
/* Minimal thread-private LCG step (glibc-style constants) returning a value
 * in [0,3]. Used instead of rand(): rand()/srand() share one hidden global
 * state and are not thread-safe, so calling them from every thread of the
 * parallel region below was a data race. */
static unsigned int next_direction(unsigned int *seed)
{
    *seed = *seed * 1103515245u + 12345u;
    return (*seed >> 16) % 4u;
}

/******************************************************************************/
/* Monte-Carlo solution of the steady-state heat equation on an M x N plate.
 *
 * Boundary conditions: left/right/bottom edges at 100.0, top edge at 0.0.
 * Each interior cell's value is estimated by averaging the boundary value
 * reached by ITER random walks started from that cell; iterations continue
 * until no cell changes by more than epsilon.                                */
/******************************************************************************/
int main ( int argc, char *argv[] )
{
# define M 12
# define N 12
# define ITER 1000

    int i, j, cur, temp_i, temp_j;
    double epsilon = 0.001;
    double mean = 0.0;
    double diff, my_diff;
    double u[M][N];
/*
 * Begin setup of the array.
 */
#pragma omp parallel shared( u ) private(i, j) reduction(+ : mean)
    {
        /* Left and right columns of the interior rows are held at 100. */
#pragma omp for
        for ( i = 1; i < M - 1; i++ ) { u[i][0] = 100.0; }
#pragma omp for
        for ( i = 1; i < M - 1; i++ ) { u[i][N-1] = 100.0; }
        /* Bottom row at 100, top row at 0. */
#pragma omp for
        for ( j = 0; j < N; j++ ) { u[M-1][j] = 100.0; }
#pragma omp for
        for ( j = 0; j < N; j++ ) { u[0][j] = 0.0; }
/*
 * Average the boundary values, to come up with a reasonable initial value
 * for the interior.
 */
#pragma omp for
        for ( i = 1; i < M - 1; i++ ) { mean = mean + u[i][0]; }
#pragma omp for
        for ( i = 1; i < M - 1; i++ ) { mean = mean + u[i][N-1]; }
#pragma omp for
        for ( j = 0; j < N; j++ ) { mean = mean + u[M-1][j]; }
#pragma omp for
        for ( j = 0; j < N; j++ ) { mean = mean + u[0][j]; }
    }
    mean = mean / ( double ) ( 2 * M + 2 * N - 4 );
    printf ( "\n" );
    printf ( " MEAN = %f\n", mean );
/*
 * Initialize the interior solution to the mean value.
 */
#pragma omp parallel shared( u ) private(i, j)
    {
#pragma omp for
        for (i = 1; i < M - 1; i++) {
            for (j = 1; j < N - 1; j++) {
                u[i][j] = mean;
            }
        }
    }
    printf(" MEAN = %f\n", mean);
/*
 * End array setup so at this point our array contains the values
 * that it starts with.
 */
    diff = epsilon;
    int iteration_number = 0;
    int run = 1;
    double wtime = omp_get_wtime();

    while (run) {
        int cont = 0;      /* number of cells that still changed by > epsilon */
        my_diff = 0.0;     /* copied into each thread via firstprivate below  */
        printf("Currently running on iteration number %d with diff %f\n",
               iteration_number, diff);
        diff = 0.0;
        iteration_number++;

        /* FIX: my_diff was shared in the original and updated concurrently by
         * all threads — a data race. firstprivate gives each thread its own
         * zeroed copy, which is max-reduced into diff in the critical section. */
#pragma omp parallel shared(u, diff) private(i, j, cur, mean, temp_i, temp_j) firstprivate(my_diff) reduction(+ : cont)
        {
            /* Per-thread PRNG state (see next_direction above). */
            unsigned int seed = (unsigned int)time(NULL)
                                ^ ((unsigned int)omp_get_thread_num() * 2654435761u);
            for (i = 1; i < M-1; i++) {
#pragma omp for
                for (j = 1; j < N-1; j++) {
                    mean = 0.0;
                    /* ITER random walks from (i,j); each walk accumulates the
                     * boundary value of the edge it first reaches. */
                    for (cur = 0; cur < ITER; cur++) {
                        temp_i = i;
                        temp_j = j;
                        while (1) {
                            int direction = (int)next_direction(&seed);
                            //Go towards the i = 0 row (boundary value 0)
                            if (direction == 0) {
                                temp_i--;
                                if (temp_i == 0) { mean += 0.0; break; }
                            }
                            //Go towards the j = 0 col (boundary value 100)
                            else if (direction == 1) {
                                temp_j--;
                                if (temp_j == 0) { mean += 100.0; break; }
                            }
                            //Go towards the i = M row (boundary value 100)
                            else if (direction == 2) {
                                temp_i++;
                                if (temp_i == (M-1)) { mean += 100.0; break; }
                            }
                            //Go towards the j = N col (boundary value 100)
                            else {
                                temp_j++;
                                if (temp_j == (N-1)) { mean += 100.0; break; }
                            }
                        }
                    }
                    double old = u[i][j];
                    /* Running average over all walks performed so far.
                     * (The original had an `iteration_number == 0` branch here,
                     * but iteration_number is incremented before this region,
                     * so that branch was unreachable and has been removed.) */
                    double cur_iter = (double) iteration_number * ITER;
                    double prev_avg = (double) cur_iter * u[i][j];
                    u[i][j] = (double) (prev_avg + mean) / (cur_iter + ITER);
                    if (fabs(old - u[i][j]) > epsilon) {
                        if (fabs(old - u[i][j]) > my_diff) {
                            my_diff = fabs(old - u[i][j]);
                        }
                        cont++;
                    }
                }
            }
            /* Max-reduce the per-thread change into the shared diff. */
#pragma omp critical
            {
                if (my_diff > diff) { diff = my_diff; }
            }
        }
        if (cont == 0) { run = 0; }  /* converged: no cell moved by > epsilon */
    }

    wtime = omp_get_wtime() - wtime;
    printf("Time taken %f\n", wtime);
    return 0;
# undef M
# undef N
}
/*
 * Matrix-vector product v2 = M * v1 with OpenMP, timing only the product.
 *
 * Usage: <prog> <N>   (N = matrix/vector dimension)
 */
int main(int argc, char** argv) {
    unsigned int i, j;          /* unsigned to match N and avoid signed/unsigned
                                   comparison (the original used int, which
                                   overflows for N > INT_MAX) */
    double t1, t2, total;

    // Read input argument (dimension of the matrix and vectors)
    if (argc < 2) {
        printf("Falta tamaño de matriz y vector\n");
        exit(-1);
    }
    unsigned int N = atoi(argv[1]); // max N = 2^32-1 = 4294967295 (sizeof(unsigned int) = 4 B)

    /* FIX: atoi() yields 0 for non-numeric/non-positive input; N == 0 would
     * make the allocations degenerate and v2[0]/v2[N-1] below invalid. */
    if (N == 0) {
        printf("El tamaño debe ser un entero positivo\n");
        exit(-1);
    }

    double *v1, *v2, **M;
    v1 = (double*) malloc(N * sizeof(double)); // malloc takes the size in bytes
    v2 = (double*) malloc(N * sizeof(double)); // malloc returns NULL on failure
    M  = (double**) malloc(N * sizeof(double *));
    if ((v1 == NULL) || (v2 == NULL) || (M == NULL)) {
        printf("Error en la reserva de espacio para los vectores\n");
        exit(-2);
    }
    for (i = 0; i < N; i++) {
        M[i] = (double*) malloc(N * sizeof(double));
        if (M[i] == NULL) {
            printf("Error en la reserva de espacio para los vectores\n");
            exit(-2);
        }
    }
    // From here on the matrix entries are accessible as M[i][j]

    // Initialize matrix and vectors
#pragma omp parallel
    {
#pragma omp for private(j)
        for (i = 0; i < N; i++) {
            v1[i] = i;
            v2[i] = 0;
            for (j = 0; j < N; j++)
                M[i][j] = i + j;
        }

        // Timestamp (single thread writes the shared t1; the implicit
        // barrier of `single` orders it before the product loop)
#pragma omp single
        t1 = omp_get_wtime();

        // Matrix-vector product v2 = M · v1
#pragma omp for private(j)
        for (i = 0; i < N; i++)
            for (j = 0; j < N; j++)
                v2[i] += M[i][j] * v1[j];

        // Timestamp
#pragma omp single
        t2 = omp_get_wtime();
    }
    total = t2 - t1;

    // Print the result and the elapsed time.
    // FIX: N-1 is unsigned, so %u (the original used %d — undefined behaviour
    // for values above INT_MAX).
    printf("Tiempo(seg.):%11.9f\t / Tamaño:%u\t/ V2[0]=%8.6f V2[%u]=%8.6f\n",
           total, N, v2[0], N - 1, v2[N - 1]);

    // Print every component of v2 (only for reasonable sizes)
    if (N < 20)
        for (i = 0; i < N; i++)
            printf(" V2[%u]=%5.2f\n", i, v2[i]);

    free(v1); // release v1
    free(v2); // release v2
    for (i = 0; i < N; i++)
        free(M[i]);
    free(M);
    return 0;
}
int main(int argc, char* argv[]) { double before, time1, time2; int M = MM; int N = NN; int P = PP; if (argc != 4) { printf("Suggested Usage: %s <M> <N> <P> \n", argv[0]); printf("Using default values\n"); } else { M = atoi(argv[1]); N = atoi(argv[2]); P = atoi(argv[3]); } double **A = Allocate2DArray< double >(M, P); double **B = Allocate2DArray< double >(P, N); double **C = Allocate2DArray< double >(M, N); double **C4 = Allocate2DArray< double >(M, N); int i, j; for (i = 0; i < M; ++i) { for (j = 0; j < P; ++j) { A[i][j] = 5.0 - ((double)(rand()%100) / 10.0); } } for (i = 0; i < P; ++i) { for (j = 0; j < N; ++j) { B[i][j] = 5.0 - ((double)(rand()%100) / 10.0); } } for (i = 0; i < M; ++i) { for (j = 0; j < N; ++j) { C[i][j] = 0.0; C4[i][j] = 0.0; } } printf("Execute Standard matmult M = %d N = %d P = %d\n\n", M, N, P); before = omp_get_wtime(); seqMatMult(M, N, P, A, B, C); time1 = omp_get_wtime() - before; printf("Standard matrix function done in %7.2f secs\n\n\n",(float)time1); before = omp_get_wtime(); matmultS(M, N, P, A, B, C4); time2 = omp_get_wtime() - before; printf("Strassen matrix function done in %7.2f secs\n\n\n",time2); printf("Checking..."); if (CheckResults(M, N, C, C4)) printf("Error in Strassen Matrix Multiplication\n\n"); else { printf("OKAY\n\n"); printf("Speedup = %5.1fX\n", time1/time2); } Free2DArray< double >(A); Free2DArray< double >(B); Free2DArray< double >(C); Free2DArray< double >(C4); return 0; }
int main(int argc, char **argv) { size_t size; fftwf_complex *data; fftwf_plan plan; if(argc >= 2) { size = atoi(argv[1]); if (size <= 0) { fprintf(stderr, "ERROR, matrix size <= 0 !\n"); return EXIT_FAILURE; } } else { fprintf(stderr, "ERROR, pass matrix size as 1st parameter !\n"); return EXIT_FAILURE; } const size_t N = size * size * size; data = (fftwf_complex*)_mm_malloc(sizeof(fftw_complex) * N, 64); if (data == NULL) { fprintf(stderr, "ERROR, _mm_malloc() !\n"); return EXIT_FAILURE; } PapiCounterList papi_routines; papi_routines.AddRoutine("fftw"); // NUMA First touch #pragma omp parallel for for (size_t i = 0; i < N; ++i) data[i][0] = data[i][1] = 1.0; fprintf(stdout, "** FFTW 3D OMP **\n"); fprintf(stdout, "* OMP_NUM_THREADS: %d\n", omp_get_max_threads()); fprintf(stdout, "* Size of Matrix: %dx%dx%d\n", (int)size, (int)size, (int)size); // fftw threads plan fftwf_plan_with_nthreads(omp_get_max_threads()); // fftw compute plan plan = fftwf_plan_dft_3d(size, size, size, data, data, FFTW_FORWARD, FFTW_MEASURE); papi_routines["fftw"].Start(); // compute results const double tstart = omp_get_wtime(); fftwf_execute(plan); const double tend = omp_get_wtime(); papi_routines["fftw"].Stop(); printf("* Wall time: %fs\n\n", tend - tstart); papi_routines.PrintScreen(); // free memory _mm_free(data); fftwf_destroy_plan(plan); return EXIT_SUCCESS; }
/*
 * Iteratively remove non-conform facets from the triangulation data set.
 *
 * Each pass scans every triangle in check_list; non-conform triangles are
 * deleted from `data`, and the candidate triangles they expose are collected
 * into check_list_new (guarded by `locks`). The lists are then swapped and
 * the process repeats until check_list is empty, the triangulation becomes
 * inconsistent, or `iterations` passes have run (-1 presumably means
 * unbounded — TODO confirm against callers).
 */
void facets_conform_dynamic_remove(data_list * data, ptriangulation triang, int iterations, tri_list * check_list, tri_list * check_list_new, omp_lock_t ** locks) {
  int dim = data_list_dim(data);
  //tri_mem_list * list = &data->mem_list;
  cube_points cube = gen_cube_points(dim);
  //Initialize the parameters. Every thread gets its own copy (threadprivate),
  //filled in by the parallel region below.
  static facet_acute_data parameters;
  #pragma omp threadprivate(parameters)
  #pragma omp parallel
  {
    parameters.cube = &cube;
    parameters.boundary_func = &triangle_boundary_cube;
    parameters.data = data;
    parameters.store_acute_ind = 0;
    // Scratch buffer for acute vertex indices; freed in the parallel region
    // at the end of this function.
    parameters.acute_ind = malloc(sizeof(vert_index) * cube.len);
  }
  int iter = 0;
  double time_start, time_check;
  size_t count = 0;
  int triang_consistent = 1;
  while (tri_list_count(check_list) && triang_consistent && (iter != iterations)) //While we have triangles to be removed)
  {
    time_start = omp_get_wtime();
    triangle cur_tri;
    tri_index cur_idx;
    int l,k;
    size_t i,j;
    size_t facets_add_total = 0;
    #pragma omp parallel shared(locks) private(cur_tri, cur_idx, i,j,k,l)
    {
      // facets_add_cnt is a per-thread counter defined elsewhere
      // (presumably threadprivate — TODO confirm); reduced into
      // facets_add_total via the atomic at the end of the region.
      facets_add_cnt = 0;
      // Thread 0 reports progress for this pass and advances `iter`.
      if (omp_get_thread_num() == 0) {
        size_t new_count = data_list_count(data);
        if (count) printf("Removed %zu triangles\n", count - new_count);
        printf("\n\nLoop %d of conform dynamic\n", iter++);
        printf("Size of entire list %zu\n", new_count);
        printf("Size of check list %zu\n", tri_list_count(check_list));
        tri_list_validate(check_list);
        count = new_count;
      }
      /*
       * Loop over all the triangles in the check list. Check if they are not conform
       * if so, add all the possible new non-conform edges to the tmp_check_list.
       */
      #pragma omp for schedule(dynamic,dim)
      for (i = 0; i < cube.len; i++) {
        if (!triang_consistent) continue; //We want break, but that is not possible with openMP
        for (j = i; j < cube.len; j++) {
          if (!triang_consistent) break;
          // Walk the packed list of triangles (i,j,*) backwards so removal
          // during iteration stays safe.
          for (l = check_list->t_arr[i][j- i].len - 1; l >= 0; l--) { //Loop over all triangles (i,j,*)
            k = check_list->t_arr[i][j - i].p_arr[l] + j;
            cur_idx[0] = i;
            cur_idx[1] = j;
            cur_idx[2] = k;
            cur_tri = triangle_from_index_cube(cur_idx, dim);
            //Cur_tri now holds the triangle we should check
            if (!data_list_contains(data, &cur_tri))
              continue;//This triangle was already removed.. Skip :-)
            if (!facet_conform(&cur_tri, &parameters)) {
              //This triangle is not conform, delete!
              //Re-run with store_acute_ind set so parameters.acute_ind is
              //filled with the apex candidates for this facet.
              parameters.store_acute_ind = 1;
              facet_conform(&cur_tri, &parameters);
              parameters.store_acute_ind = 0;
              //Add all the sides of conform tetrahedrons with cur_tri as base to the possible non-conform list.
              facets_tetra_list(check_list_new, cur_idx, parameters.acute_ind, parameters.acute_ind_len, locks);
              //Cur_tri is not conform, remove from the data structure.
              if (data->mode == DATA_MEM_LIST_CUBE)
                mem_list_cube_clear(&data->mem_list, &cur_tri);
              else
                tri_list_remove(&data->list, &cur_tri, TRI_LIST_NO_RESIZE);
            }
          }
        }
        // NOTE(review): this consistency check runs inside the i-loop, so
        // thread 0 re-evaluates it once per chunk it processes; other threads
        // observe the shared flag without synchronization. Looks intentional
        // (early-out heuristic) but worth confirming.
        if (omp_get_thread_num() == 0)
          triang_consistent = triangulation_consistent(triang, &parameters);
      }
      // Sum the per-thread candidate counters.
      #pragma omp atomic
      facets_add_total += facets_add_cnt;
    }
    if (triang_consistent) {
      printf("Amount of triangles in new_check_list: +/- %zu\n", facets_add_total);
      printf("Amount of triangles in new_check_list: exact %zu\n", tri_list_count(check_list_new));
      //Checked all the triangles from check_list. Empty it and swap the lists.
      tri_list_empty(check_list);
      if (iter != iterations) {
        tri_list tmp = *check_list;
        *check_list = *check_list_new;
        *check_list_new = tmp;
      }
    } else
      printf("Triangulation not consistent anymore\n");
    time_check = omp_get_wtime();
    printf("\nTook %f seconds to construct new check list\n",time_check - time_start);
  }
  free(cube.points);
  // Each thread frees its own threadprivate scratch buffer.
  #pragma omp parallel
  {
    free(parameters.acute_ind);
  }
}
/*
 * Time-step the density field on the (OX_LEN+1) x (OY_LEN+1) grid.
 *
 * Each of the TIME_STEP_CNT steps sets the boundary from the analytical
 * solution, updates the interior via integrate() + the source term func_f(),
 * and copies the result into the module-level PREV_DENSITY for the next step.
 * On exit `time` holds the wall-clock seconds spent in the time loop.
 */
static void solve(double* density, double& time) {
    // Seed the previous-step buffer with the analytical solution at t = 0.
    PREV_DENSITY = new double[XY_LEN];
    for (int j = 0; j < OY_LEN + 1; j++) {
        for (int i = 0; i < OX_LEN_1; i++) {
            PREV_DENSITY[OX_LEN_1 * j + i] = analytical_solution(0, OX[i], OY[j]);
        }
    }
    int i = 0, j = 0, tl = 0;
    double timeStart = 0, timeEnd=0;
#ifdef _OPENMP
    // printf("OPENMP THREADS COUNT = %d\n", omp_get_max_threads());
    long count = 0;
    // dummy parallel section to get all threads running before timing starts
    // NOTE(review): _InterlockedIncrement is an MSVC intrinsic — this branch
    // presumably only builds on Windows; confirm for other toolchains.
#pragma omp parallel private(i,j)
    {
        _InterlockedIncrement(&count);
    }
#endif
#ifdef _OPENMP
    // printf("OPENMP timer function is used!\n");
    timeStart = omp_get_wtime();
#else
    // printf("Standart timer function is used!\n");
    StartTimer();
#endif
    fflush(stdout);
    for (tl = 1; tl <= TIME_STEP_CNT; tl++) {
        PREV_TIME = TIME;
        TIME = TAU * tl;
        // Bottom (BB) and top (UB) boundary rows from the analytical solution.
        for (int k = 0; k <= OX_LEN; k++) {
            density[k] = analytical_solution(OX[k], BB, TIME);
            density[OX_LEN_1 * OY_LEN + k] = analytical_solution(OX[k], UB, TIME);
        }
        // Left (LB) and right (RB) boundary columns.
        for (int u = 0; u <= OY_LEN; u++) {
            density[OX_LEN_1 * u] = analytical_solution(LB, OY[u], TIME);
            density[OX_LEN_1 * u + OX_LEN] = analytical_solution(RB, OY[u], TIME);
        }
        // Interior update: each (i,j) cell is written by exactly one
        // iteration, so the collapsed loops parallelize safely.
#ifdef _OPENMP
#pragma omp parallel for collapse(2) private(i, j)
#endif
        for (j = 1; j < OY_LEN; ++j) {
            for (i = 1; i < OX_LEN; ++i) {
                density[OX_LEN_1 * j + i] = integrate(i, j);
                density[OX_LEN_1 * j + i] += TAU * func_f(B, TIME, UB, BB, LB, RB, OX[i], OY[j]);
            }
        }
        memcpy(PREV_DENSITY, density, XY_LEN * sizeof(double));// TODO: replace with the fast version from Agner's asmlib
    }
#ifdef _OPENMP
    timeEnd = omp_get_wtime();
    time = (timeEnd-timeStart);
    // printf("time %f s.\n", time);
#else
    time = GetTimer()/1000;
    // printf("time %f s.\n", time/1000);
#endif
    delete [] PREV_DENSITY;
}
/*
 * HELLO_OPENMP — "Hello, World" for OpenMP.
 *
 * Prints a greeting once from the (serial) master context, then once from
 * every thread inside a parallel region, and reports the elapsed wall time.
 *
 * Licensing: distributed under the GNU LGPL license.
 * Original author: John Burkardt (23 June 2010).
 */
int main ( int argc, char *argv[] )
{
  int thread_id;
  double wtime;

  /* Banner plus the runtime's view of the machine. */
  printf ( "\nHELLO_OPENMP\n C/OpenMP version\n\n" );
  printf ( " Number of processors available = %d\n", omp_get_num_procs ( ) );
  printf ( " Number of threads =              %d\n", omp_get_max_threads ( ) );

  wtime = omp_get_wtime ( );

  /* Serial greeting: omp_get_thread_num() returns 0 outside a region. */
  printf ( "\n OUTSIDE the parallel region.\n\n" );
  thread_id = omp_get_thread_num ( );
  printf ( "  HELLO from process %d\n", thread_id ) ;
  printf ( "\n Going INSIDE the parallel region:\n\n" );

  /* One greeting per thread; thread_id is private so each thread prints
     its own number. */
# pragma omp parallel private ( thread_id )
  {
    thread_id = omp_get_thread_num ( );
    printf ("  Hello from process %d\n", thread_id );
  }

  /* Measure the elapsed wall-clock time and wrap up. */
  wtime = omp_get_wtime ( ) - wtime;
  printf ( "\n Back OUTSIDE the parallel region.\n" );
  printf ( "\nHELLO_OPENMP\n  Normal end of execution.\n" );
  printf ( "\n Elapsed wall clock time = %f\n", wtime );

  return 0;
}
int main() { SetThreads(); PrintInfo(); double Start = omp_get_wtime(); double * restrict ResultPrices; ResultPrices = malloc(sizeof(double) * HISTORY); #pragma offload target(mic) out(ResultPrices:length(HISTORY)) { SetMICThreads(); double * restrict Prices; double * restrict Epsilon; Prices = malloc(sizeof(double) * HISTORY); Epsilon = malloc(sizeof(double) * HISTORY); //Creating random stream VSLStreamStatePtr RndStream; vslNewStream(&RndStream, VSL_BRNG_SFMT19937, (int)time(NULL)); long double Buff; for (unsigned int iter = 0; iter < TE; iter++) { //Randomize volumes vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, RndStream, HISTORY, Epsilon, 0, 0.002); #pragma omp parallel for shared(Prices, ResultPrices) for (unsigned long long int i = 0; i < HISTORY; i++) { //Buff = i * i * powl(10, (-21.65) - i * 4.5 * powl(10, (-10.65)); //Prices[i] = (((i * i * powl(10, (-24.65))) - (i * 4.5 * powl(10, (-13.65))) + 1.095) + Epsilon[i]); Prices[i] = ( ( i * i * powl(10, (-24.65)) - i * 4.5 * powl(10, (-13.65)) + 1.095 ) + Epsilon[i]); ResultPrices[i] += Prices[i]; } } #pragma omp parallel for shared(ResultPrices) for (unsigned long long int j = 0; j < HISTORY; j++) { ResultPrices[j] = ResultPrices[j] / TE;; } free(Prices); free(Epsilon); Prices = NULL; Epsilon = NULL; } double End = omp_get_wtime(); printf("%lf\n", (End - Start)); FILE *FpResultHistory; //unsigned long long int Buff; FpResultHistory = fopen("res_history.txt", "wb"); if (FpResultHistory) { printf("//================================================================\n"); printf("|| Result history file status : open\n"); for (unsigned long long int i = 0; i < HISTORY; i++) { //Buff = (i); fprintf(FpResultHistory, "%llu %lf\n", (i * 10), ResultPrices[i]); //fprintf(fp_result, "%lf %lf %lf\n", ResultPrices[i], ResultVolumeUp[i], ResultVolumeDown[i]); } fclose(FpResultHistory); printf("|| Result history file status : close\n||\n"); printf("\\================================================================\n\n"); } 
free(ResultPrices); ResultPrices = NULL; return 0; }
std::vector<GraspHypothesis> HandSearch::findHands(const PointCloud::Ptr cloud, const Eigen::VectorXi& pts_cam_source, const std::vector<Quadric>& quadric_list, const Eigen::VectorXi& hands_cam_source, const pcl::KdTreeFLANN<pcl::PointXYZ>& kdtree) { double t1 = omp_get_wtime(); std::vector<int> nn_indices; std::vector<float> nn_dists; Eigen::Matrix3Xd nn_normals(3, nn_indices.size()); Eigen::VectorXi nn_cam_source(nn_indices.size()); Eigen::Matrix3Xd centered_neighborhood(3, nn_indices.size()); std::vector<RotatingHand> hand_list(quadric_list.size()); // std::vector<RotatingHand> hand_list; double time_eval_hand = 0.0; double time_iter = 0.0; double time_nn = 0.0; double time_tf = 0.0; std::vector< std::vector<GraspHypothesis> > grasp_lists(quadric_list.size(), std::vector<GraspHypothesis>(0)); #ifdef _OPENMP // parallelization using OpenMP #pragma omp parallel for private(nn_indices, nn_dists, nn_normals, nn_cam_source, centered_neighborhood) num_threads(num_threads_) #endif for (std::size_t i = 0; i < quadric_list.size(); i++) { double timei = omp_get_wtime(); pcl::PointXYZ sample; sample.x = quadric_list[i].getSample()(0); sample.y = quadric_list[i].getSample()(1); sample.z = quadric_list[i].getSample()(2); // std::cout << "i: " << i << ", sample: " << sample << std::endl; if (kdtree.radiusSearch(sample, nn_radius_hands_, nn_indices, nn_dists) > 0) { time_nn += omp_get_wtime() - timei; nn_normals.setZero(3, nn_indices.size()); nn_cam_source.setZero(nn_indices.size()); centered_neighborhood.setZero(3, nn_indices.size()); for (int j = 0; j < nn_indices.size(); j++) { nn_cam_source(j) = pts_cam_source(nn_indices[j]); centered_neighborhood.col(j) = (cloud->points[nn_indices[j]].getVector3fMap() - sample.getVector3fMap()).cast<double>(); nn_normals.col(j) = cloud_normals_.col(nn_indices[j]); } FingerHand finger_hand(finger_width_, hand_outer_diameter_, hand_depth_); Eigen::Vector3d sample_eig = sample.getVector3fMap().cast<double>(); RotatingHand 
rotating_hand(cam_tf_left_.block<3, 1>(0, 3) - sample_eig, cam_tf_right_.block<3, 1>(0, 3) - sample_eig, finger_hand, tolerant_antipodal_, hands_cam_source(i)); const Quadric& q = quadric_list[i]; double time_tf1 = omp_get_wtime(); rotating_hand.transformPoints(centered_neighborhood, q.getNormal(), q.getCurvatureAxis(), nn_normals, nn_cam_source, hand_height_); time_tf += omp_get_wtime() - time_tf1; double time_eval1 = omp_get_wtime(); std::vector<GraspHypothesis> grasps = rotating_hand.evaluateHand(init_bite_, sample_eig, true); time_eval_hand += omp_get_wtime() - time_eval1; if (grasps.size() > 0) { // grasp_list.insert(grasp_list.end(), grasps.begin(), grasps.end()); grasp_lists[i] = grasps; } } time_iter += omp_get_wtime() - timei; } time_eval_hand /= quadric_list.size(); time_nn /= quadric_list.size(); time_iter /= quadric_list.size(); time_tf /= quadric_list.size(); //std::cout << " avg time for transforming point neighborhood: " << time_tf << " sec.\n"; //std::cout << " avg time for NN search: " << time_nn << " sec.\n"; //std::cout << " avg time for rotating_hand.evaluate(): " << time_eval_hand << " sec.\n"; //std::cout << " avg time per iteration: " << time_iter << " sec.\n"; std::vector<GraspHypothesis> grasp_list; for (std::size_t i = 0; i < grasp_lists.size(); i++) { // std::cout << i << " " << grasp_lists[i].size() << "\n"; if (grasp_lists[i].size() > 0) grasp_list.insert(grasp_list.end(), grasp_lists[i].begin(), grasp_lists[i].end()); } double t2 = omp_get_wtime(); //std::cout << " Found " << grasp_list.size() << " robot hand poses in " << t2 - t1 << " sec.\n"; return grasp_list; }
/**
 * Iterative (relaxed Jacobi) solver benchmark with CPU and CUDA paths.
 *
 * call: ./main <matrix_dimension> <number_of_tests> <use_gpu>
 *
 * Generates a random dataset per test, solves Ax = b by damped Jacobi
 * iteration (damping factor alpha) until the solution stops changing by more
 * than min_diff or max_iter is reached, then reports RMSE of A*x against b
 * and per-test timing statistics.
 */
int main(int argc, char* argv[]) {
    cuda_identify();
    if (argc != 4) {
        printf("program must be called with arguments: matrix_dimension tests_number use_gpu(0/1)\n");
        exit(1);
    }
    const int M = atoi(argv[1]);
    printf("Using matrix dimension: %d\n", M);
    const int tests = atoi(argv[2]);
    const bool cpu = !atoi(argv[3]);

    // always use the same seed to get the same matrices during tests
    srand(0);

#ifdef DOUBLE
    const fp_t min_diff = 0.00000001; //for double, fails with 8192 and floats on both cpu and gpu
#else
    const fp_t min_diff = 0.000001;
#endif
    const fp_t alpha = 0.9;   // damping factor of the relaxed Jacobi update
    const int max_iter = 50;  // negative would mean "no iteration cap"

    fp_t* exec_times = malloc(tests * sizeof(fp_t));
    fp_t* all_rmse = malloc(tests * sizeof(fp_t));

    for (int k = 0; k < tests; k++) {
        const DataSet dataset = generate_dataset(M);
        Matrix* last_x = aligned_vector(M, true);
        Matrix* x = aligned_vector(M, true);
        // NOTE(review): empty loop body — looks like leftover scaffolding.
        for (int i = 0; i < M; i++) {
        }
        int iterations = 0;

        // solve Ax = b
        const fp_t start_time = omp_get_wtime();
        fp_t sum = 0;
        int j = 0;
        int i = 0;
        const Matrix* A = dataset.A;
        const Matrix* b = dataset.b;
        assert(x != last_x);
        if (cpu) {
            // CPU path. The commented-out pragmas are the (disabled) OpenMP
            // parallelization of this loop nest.
            //#pragma omp parallel shared(last_x, x, iterations) private(i, j, sum)
            while ((matrix_diff(x, last_x) > min_diff) && (max_iter < 0 || iterations < max_iter)) {
                //fp_t st_time0 = omp_get_wtime();
                //#pragma omp single
                {
                    // Reuse the previous iterate's storage for the new one.
                    swap(last_x, x);
                }
                // A, M, alpha and b are constant, so they cannot be declared as shared
                //#pragma omp for schedule(dynamic)
                for (i = 0; i < M; i++) {
                    sum = 0;
                    //#pragma omp simd aligned(A, last_x: 16) reduction(+:sum) linear(j)
                    for (j = 0; j < M; j++) {
                        sum += A->elements[i * M + j] * last_x->elements[j];
                    }
                    sum -= A->elements[i * M + i] * last_x->elements[i]; // opt: outside the loop for sse optimizer
                    // Damped Jacobi update: blend of previous value and the
                    // classic Jacobi step.
                    x->elements[i] = (1 - alpha) * last_x->elements[i]
                        + alpha * (b->elements[i] - sum) / A->elements[i * M + i];
                }
                //#pragma omp single nowait
                {
                    iterations++;
                }
                //printf("%dus spent\n", (int)((omp_get_wtime() - st_time0) * 1000000));
            }
        } else {
            // GPU path: upload A, b, x, last_x; iterate multiply/reduce/diff
            // kernels until converged.
            Matrix* d_A = device_matrix_from(A);
#ifndef DOUBLE
#ifdef TEXTURE
            texbind(d_A->elements, d_A->size * sizeof(fp_t));
#endif
#endif
            cudaMemcpy(d_A->elements, A->elements, A->size * sizeof(fp_t), cudaMemcpyHostToDevice);
            Matrix* d_b = device_matrix_from(b);
            cudaMemcpy(d_b->elements, b->elements, b->size * sizeof(fp_t), cudaMemcpyHostToDevice);
            Matrix* d_last_x = device_matrix_from(last_x);
            Matrix* d_c = device_matrix_from(b);   // scratch for A * last_x
            Matrix* d_x = device_matrix_from(x);
            cudaMemcpy(d_x->elements, x->elements, x->size * sizeof(fp_t), cudaMemcpyHostToDevice);
            cudaMemcpy(d_last_x->elements, last_x->elements, last_x->size * sizeof(fp_t), cudaMemcpyHostToDevice);

            fp_t x_diff = 2 * min_diff;  // anything > min_diff to enter the loop
            fp_t* d_x_diff;
            cudaMalloc((void**)&d_x_diff, sizeof(fp_t));
            //fp_t stime;
            while ((x_diff > min_diff) && (max_iter < 0 || iterations < max_iter)) {
                //stime = omp_get_wtime();
                cuda_multiply(*d_A, *d_last_x, *d_c);
                //print_cuda_elapsed(stime);

                //stime = omp_get_wtime();
                cuda_reduce(*d_A, *d_b, *d_c, d_x, d_last_x, alpha); //performs swap
                //print_cuda_elapsed(stime);

                //stime = omp_get_wtime();
                cuda_diff(*d_x, *d_last_x, d_x_diff);
                //print_cuda_elapsed(stime);

                iterations++;
                //cudaMemcpyFromSymbol(&x_diff, "d_x_diff", sizeof(x_diff), 0, cudaMemcpyDeviceToHost);
                //stime = omp_get_wtime();
                cudaMemcpy(&x_diff, d_x_diff, sizeof(fp_t), cudaMemcpyDeviceToHost);
                //print_cuda_elapsed(stime);
            }
            // copy last_x instead, as it was swapped
            cudaMemcpy(x->elements, d_last_x->elements, x->size * sizeof(fp_t), cudaMemcpyDeviceToHost);
#ifndef DOUBLE
#ifdef TEXTURE
            texunbind();
#endif
#endif
            cudaFree(d_A->elements);
            cudaFree(d_b->elements);
            cudaFree(d_last_x->elements);
            cudaFree(d_c->elements);
            cudaFree(d_x->elements);
            cudaFree(d_x_diff);
            free(d_A);
            free(d_b);
            free(d_c);
            free(d_last_x);
            free(d_x);
        }
        const fp_t end_time = omp_get_wtime();
        const fp_t seconds_spent = end_time - start_time;
        exec_times[k] = seconds_spent;
        if (verbose) {
            printf("x: ");
            print_matrix(x);
            printf("expected_x: ");
            print_matrix(dataset.x);
            //print_matrix(dataset.A);
            //print_matrix(dataset.b);
        }
        // Recompute b from the solution to measure the residual error.
        Matrix* bx = aligned_vector(M, false);
        for (int i = 0; i < M; i++) {
            for (int j = 0; j < M; j++) {
                bx->elements[i] += A->elements[i * M + j] * x->elements[j];
            }
        }
        if (verbose) {
            printf("resulting b: ");
            print_matrix(bx);
        }
        all_rmse[k] = rmse(bx, b);
        printf("RMSE: %0.10f\n", all_rmse[k]);
        printf("iterations: %d\nseconds: %0.10f\n", iterations, seconds_spent);
        assert(x != last_x);
        free(bx->elements);
        free(x->elements);
        free(last_x->elements);
        free(dataset.x->elements);
        free(dataset.A->elements);
        free(dataset.b->elements);
        free(bx);
        free(x);
        free(last_x);
        free(dataset.x);
        free(dataset.A);
        free(dataset.b);
    }
    printf("Time: mean %0.10f std %0.10f\n", array_mean(exec_times, tests), array_std(exec_times, tests));
    printf("RMSE: mean %0.10f std %0.10f\n", array_mean(all_rmse, tests), array_std(all_rmse, tests));
    free(all_rmse);
    free(exec_times);
    return 0;
}
/* Print the wall-clock milliseconds elapsed since start_time, after waiting
 * for all queued GPU work to finish so the measurement covers it. */
void print_cuda_elapsed(fp_t start_time)
{
    // Drain pending kernels/copies before reading the clock.
    cudaDeviceSynchronize();
    const fp_t elapsed_seconds = omp_get_wtime() - start_time;
    const int elapsed_ms = (int)(elapsed_seconds * 1000);
    printf("%dms spent\n", elapsed_ms);
}
void SilhouetteExtractor::computeVisibleFrontFacingStatus() { int terrain_width = terrain_->width(); int terrain_height = terrain_->height(); delete front_facing_; front_facing_ = new FacingMode [(terrain_width-1)*(terrain_height-1)]; bool use_intersections = true ; if (!use_intersections) { setupPixelBuffer(); pixelbuffer_->makeCurrent(); glEnable(GL_DEPTH_TEST); glEnable(GL_LIGHT0); glEnable(GL_LIGHTING); GLfloat lightpos[] = {.5, 1., 1., 0.}; glLightfv(GL_LIGHT0, GL_POSITION, lightpos); glClearColor(1.0f, 1.0f, 1.0f, 1.0f); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); drawTerrain(); updateMatrices(); saveBuffer("test_dpth.png"); pixelbuffer_->makeCurrent(); } double begin = omp_get_wtime(); int i = 0; #pragma omp parallel for private(i) for (int j = 0; j < terrain_->height()-1; ++j) for (i = 0; i < terrain_->width()-1; ++i) { front_facing_[j*(terrain_width-1)+i] = kInvisible; Eigen::Vector3f center = getFaceCentroid(i, j); Eigen::Vector3f projector = center - camera_info_.position; //projector = camera_info_.direction; float theta = acos(camera_info_.direction.normalized().dot(projector.normalized())); if (theta > camera_info_.fov_in_rads/2) continue; front_facing_[j*(terrain_width-1)+i] = kBackFacing; if (terrain_->getGridNormal(i, j).dot(projector) <= -FLT_EPSILON) { if (use_intersections) { if (checkVisibility(center)) front_facing_[j*(terrain_width-1)+i] = kFrontFacing; } else { Eigen::Vector3d window_coords; gluProject(center[0], center[1], center[2], modelview_matrix_, projection_matrix_, viewport_, &window_coords[0], &window_coords[1], &window_coords[2]); if (window_coords[0] < 0 || window_coords[1] < 0 || window_coords[0] >= width() || window_coords[1] >= height()) continue; float depth = 0.0; glReadPixels(window_coords[0], window_coords[1], 1, 1, GL_DEPTH_COMPONENT, GL_FLOAT, &depth); if (std::abs(depth-window_coords[2]) < 1e-3) front_facing_[j*(terrain_width-1)+i] = kFrontFacing; } } } double end = omp_get_wtime(); double elapsed_secs = 
double(end - begin); fprintf(stdout, "Elapsed time for checking front/back facing: %.2f secs\n", elapsed_secs); fprintf(stdout, "Num of threads: %d threads\n", omp_get_thread_num()); fflush(stdout); if (pixelbuffer_) { pixelbuffer_->doneCurrent(); cleanupPixelBuffer(); } }
int main(int argc, char **argv) { int Ndim; // A[Ndim][Ndim] int i,j, iters; double start_time, elapsed_time; TYPE conv, tmp, err, chksum; TYPE *A, *b, *x1, *x2, *xnew, *xold, *xtmp; // set matrix dimensions and allocate memory for matrices if(argc ==2){ Ndim = atoi(argv[1]); } else{ Ndim = DEF_SIZE; } printf(" jacobi solver parallel for version: ndim = %d\n",Ndim); A = (TYPE *) malloc(Ndim*Ndim*sizeof(TYPE)); b = (TYPE *) malloc(Ndim*sizeof(TYPE)); x1 = (TYPE *) malloc(Ndim*sizeof(TYPE)); x2 = (TYPE *) malloc(Ndim*sizeof(TYPE)); if (!A || !b || !x1 || !x2) { printf("\n memory allocation error\n"); exit(-1); } // generate our diagonally dominant matrix, A init_diag_dom_near_identity_matrix(Ndim, A); #ifdef VERBOSE mm_print(Ndim, Ndim, A); #endif // // Initialize x and just give b some non-zero random values // for(i=0; i<Ndim; i++){ x1[i] = (TYPE)0.0; x2[i] = (TYPE)0.0; b[i] = (TYPE)(rand()%51)/100.0; } start_time = omp_get_wtime(); // // jacobi iterative solver // conv = LARGE; iters = 0; xnew = x1; xold = x2; { // note: i am comparing against the convergence sqaured. This saves a // sqrt and an extra barrier. while((conv > TOLERANCE*TOLERANCE) && (iters<MAX_ITERS)) { { iters++; conv = 0.0; xtmp = xnew; // don't copy arrays. xnew = xold; // just swap pointers. 
xold = xtmp; } #pragma omp parallel for private(i,j) for (i=0; i<Ndim; i++){ xnew[i] = (TYPE) 0.0; for (j=0; j<Ndim;j++){ // if(i!=j) // xnew[i]+= A[i*Ndim + j]*xold[j]; xnew[i]+= A[i*Ndim + j]*xold[j] * (i != j); } xnew[i] = (b[i]-xnew[i])/A[i*Ndim+i]; } // // test convergence // #pragma omp parallel for private(tmp) reduction(+:conv) for (i=0; i<Ndim; i++){ tmp = xnew[i]-xold[i]; conv += tmp*tmp; } #ifdef DEBUG printf(" conv = %f \n",(float)conv); #endif } } conv = sqrt((double)conv); elapsed_time = omp_get_wtime() - start_time; printf(" Convergence = %g with %d iterations and %f seconds\n", (float)conv, iters, (float)elapsed_time); // // test answer by multiplying my computed value of x by // the input A matrix and comparing the result with the // input b vector. // err = (TYPE) 0.0; chksum = (TYPE) 0.0; for(i=0;i<Ndim;i++){ xold[i] = (TYPE) 0.0; for(j=0; j<Ndim; j++) xold[i] += A[i*Ndim+j]*xnew[j]; tmp = xold[i] - b[i]; #ifdef DEBUG printf(" i=%d, diff = %f, computed b = %f, input b= %f \n", i, (float)tmp, (float)xold[i], (float)b[i]); #endif chksum += xnew[i]; err += tmp*tmp; } err = sqrt((double)err); printf("jacobi solver: err = %f, solution checksum = %f \n", (float)sqrt(err), (float)chksum); free(A); free(b); free(x1); free(x2); }
int main() { /* Creat the file to save results */ char *varnames[NUM_VARS] = {"x_rec_all"}; create_netcdf(FILENAME_WR, NUM_VARS, varnames); /* Allocate memory */ double *x_fusion_lf_all = (double*)malloc(NUM_3DSNAPS * NUM_2DSNAPS * N_HR * N_HR * sizeof(double)); double *x_fusion_hf_all = (double*)malloc(NUM_3DSNAPS * NUM_2DSNAPS * N_HR * N_HR * sizeof(double)); double *x_rec_all = (double*)malloc(NUM_3DSNAPS * NUM_2DSNAPS * N_HR * N_HR * sizeof(double)); /* read all snapshots */ size_t start_ids[4] = {0, 0, 0, 0}; size_t count_ids[4] = {NUM_3DSNAPS, NUM_2DSNAPS, N_HR, N_HR }; read_netcdf(FILENAME_RD, "Uinterp_all", start_ids, count_ids, x_fusion_lf_all); read_netcdf(FILENAME_RD, "Udiff_all", start_ids, count_ids, x_fusion_hf_all); double time_all_start = omp_get_wtime(); double *x_current_lf = (double*)malloc(N_HR * N_HR * sizeof(double)); double *x_current_hf = (double*)malloc(N_HR * N_HR * sizeof(double)); double *x_rec = (double*)malloc(N_HR * N_HR * sizeof(double)); long int grid_size = N_HR * N_HR * NEIGHBOR_FULLSIZE * NEIGHBOR_FULLSIZE * SIM_FULLSIZE * SIM_FULLSIZE; int *gridpatches_y = (int*)malloc(grid_size * sizeof(int)); int *gridpatches_z = (int*)malloc(grid_size * sizeof(int)); int *acc_ids = (int*)malloc(ACC_FULLSIZE * ACC_FULLSIZE * sizeof(int)); generate_grids(gridpatches_y, gridpatches_z, acc_ids); for(int snap3d_id = 0; snap3d_id < NUM_3DSNAPS; snap3d_id++) { int t_offset = snap3d_id * NUM_2DSNAPS * N_HR*N_HR; // put first PIV get_onesnap(x_fusion_hf_all, x_current_hf, t_offset + 0 * N_HR * N_HR, t_offset + 1 * N_HR * N_HR - 1); put_onesnap(x_rec_all, x_current_hf, t_offset + 0 * N_HR * N_HR, t_offset + 1 * N_HR * N_HR - 1); int block_id; for(block_id = 0; block_id < NUM_BLOCKS; block_id++) { double time_start = omp_get_wtime(); int t_first = SCALE_FACTOR_TIME*block_id; int t_last = SCALE_FACTOR_TIME*(block_id+1); // Put last PIV of the block get_onesnap(x_fusion_hf_all, x_current_hf, t_offset + t_last * N_HR * N_HR, t_offset + (t_last + 1) * N_HR 
* N_HR - 1); put_onesnap(x_rec_all, x_current_hf, t_offset + t_last * N_HR * N_HR, t_offset + (t_last + 1) * N_HR * N_HR - 1); if (SCALE_FACTOR_TIME % 2) { int t_bound1 = t_first + (int)SCALE_FACTOR_TIME/2; int t_bound2 = t_bound1 + 1; propag_forward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_first, t_bound1, t_offset); propag_backward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_last, t_bound2, t_offset); } else { int t_mid = t_first + (int)SCALE_FACTOR_TIME/2; int t_bound1 = t_mid - 1; int t_bound2 = t_mid + 1; propag_forward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_first, t_bound1, t_offset); propag_backward(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_last, t_bound2, t_offset); propag_2planes(x_rec_all, x_fusion_lf_all, gridpatches_y, gridpatches_z, acc_ids, t_mid, t_offset); printf("\n Estimated block %i (total 23) in 3D snapshot %i (total 37) in %f seconds \n", block_id, snap3d_id, (double)omp_get_wtime() - time_start); } } } // Write to file write_netcdf(FILENAME_WR, "x_rec_all", start_ids, count_ids, x_rec_all); /* free memory */ free(x_rec); free(x_current_lf); free(x_current_hf); free(x_rec_all); free(x_fusion_lf_all); free(x_fusion_hf_all); free(gridpatches_y); free(gridpatches_z); free(acc_ids); printf("\n FINISH ALL COMPUTATION IN %f SECONDS \n", (double)omp_get_wtime() - time_all_start); return 1; }
int main(int argc, char** argv) { const int n = NN; const int m = NM; const int iter_max = 1000; const double tol = 1.0e-6; double error = 1.0; int use_gpu = 1; memset(A, 0, n * m * sizeof(double)); memset(Anew, 0, n * m * sizeof(double)); for (int j = 0; j < n; j++) { A[j][0] = 1.0; Anew[j][0] = 1.0; } printf("Jacobi relaxation Calculation: %d x %d mesh\n", n, m); double st = omp_get_wtime(); int iter = 0; #pragma omp target data map(to:Anew) map(A) if(use_gpu) while ( error > tol && iter < iter_max ) { error = 0.0; #pragma omp target teams distribute parallel for reduction(max:error) map(error) if(target:use_gpu) for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { Anew[j][i] = 0.25 * ( A[j][i+1] + A[j][i-1] + A[j-1][i] + A[j+1][i]); error = fmax( error, fabs(Anew[j][i] - A[j][i])); } } #pragma omp target teams distribute parallel for if(target:use_gpu) for( int j = 1; j < n-1; j++) { for( int i = 1; i < m-1; i++ ) { A[j][i] = Anew[j][i]; } } if(iter % 100 == 0) printf("%5d, %0.6f\n", iter, error); iter++; } double et = omp_get_wtime(); printf(" total: %f s\n", (et - st)); return 0; }
/* Run the computation on several threads.
 * Parameters:
 *   threads_num - number of threads to use
 *   b           - right-hand boundary condition
 *   str_b       - string representation of the number b (used as an output
 *                 file name prefix)
 * Solves the boundary value problem with Newton's method, using parallel
 * cyclic reduction for the tridiagonal linearized systems. Returns the
 * average wall-clock time per repetition.
 */
double run(int threads_num, double b, char* str_b)
{
    int i, j, step,k;
    double* y = calloc(N + 1, sizeof(double));  /* grid solution */
    double* dy = calloc(N + 1, sizeof(double)); /* difference y^n - y^(n+1) between two consecutive Newton iterates */
    double *A[R], *B[R], *C[R], *G[R];          /* coefficients of the tridiagonal system at each reduction level */
    double begin, end;

    omp_set_dynamic(0);                 /* forbid dynamic adjustment of the thread count */
    omp_set_num_threads(threads_num);   /* use exactly threads_num threads */

    for(i = 0; i < R; i++)
    {
        A[i] = calloc(N + 1, sizeof(double));
        B[i] = calloc(N + 1, sizeof(double));
        C[i] = calloc(N + 1, sizeof(double));
        G[i] = calloc(N + 1, sizeof(double));
    }

    begin = omp_get_wtime(); /* start of the timed region */
    for( k = 0; k < REPEATS; k++){
    #pragma omp parallel private(i, j)
    {
        #pragma omp for
        for(i = 0; i <= N; i++)
            y[i] = 1.0 + (b - 1.0) * i / N; /* initial guess: linear between the boundary values */
        #pragma omp single
        {
            dy[0] = dy[N] = 0.0;
            for(j = 0; j < R; j++)
                B[j][0] = B[j][N] = 1.0; /* during reduction the boundary matrix entries stay the same across all Newton iterations */
        }
        while(1) /* Newton iterations */
        {
            #pragma omp for
            for(i = 1; i < N; i++) /* initial (level-0) coefficient values */
            {
                B[0][i] = (-2.0 / (h * h) - 5 * exp(y[i]) / 6);
                A[0][i] = (1.0 / (h * h) - exp(y[i - 1]) / 12);
                C[0][i] = (1.0 / (h * h) - exp(y[i + 1]) / 12);
                G[0][i] = F(y, b, i);
            }
            for(j = 1; j < R; j++) /* coefficient values after each reduction level */
            {
                step = pow(2, j); /* elimination stride at this reduction level */
                #pragma omp for
                for(i = step; i < N; i += step)
                {
                    B[j][i] = B[j - 1][i] - A[j - 1][i] * C[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * A[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                    A[j][i] = - A[j - 1][i] * A[j - 1][i - step / 2] / B[j - 1][i - step / 2];
                    C[j][i] = - C[j - 1][i] * C[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                    G[j][i] = G[j - 1][i] - A[j - 1][i] * G[j - 1][i - step / 2] / B[j - 1][i - step / 2] - C[j - 1][i] * G[j - 1][i + step / 2] / B[j - 1][i + step / 2];
                }
            } /* forward cyclic reduction finished */
            #pragma omp single
            {
                dy[N / 2] = G[R - 1][N / 2] / B[R - 1][N / 2]; /* first back-substitution step of the reduction */
                dy[N / 4] = (G[R - 2][N / 4] - C[R - 2][N / 4] * dy[N / 2]) / B[R - 2][N / 4];
                dy[N * 3 / 4] = (G[R - 2][N * 3 / 4] - A[R - 2][N * 3 / 4] * dy[N / 2] ) / B[R - 2][N * 3 / 4]; /* second back-substitution step */
            }
            for(j = R - 3; j >= 0; j--)
            {
                step = pow(2, j);
                #pragma omp for
                for(i = step; i < N; i += 2 * step)
                    dy[i] = (G[j][i] - C[j][i] * dy[i + step] - A[j][i] * dy[i - step]) / B[j][i];
            } /* remaining back-substitution steps of the reduction */
            #pragma omp for
            for(i = 0; i <= N; i++)
                y[i] -= dy[i]; /* one Newton update */
            /* Newton stopping criterion; every thread evaluates the same
               norm(dy), so all threads leave the loop together. */
            if (norm(dy) < epsilon)
                break;
        }
    }
    }
    end = omp_get_wtime(); /* end of the timed region */

    for(i = 0; i < R; i++)
    {
        free(A[i]);
        free(B[i]);
        free(C[i]);
        free(G[i]);
    }

    char str_dest[50];
    /* Write the computed function to "<b>par_result.txt" */
    FILE* fp = fopen(strcat(strcpy(str_dest, str_b), "par_result.txt"), "w");
    fprintf(fp, "X\tY\r\n");
    for(i = 0; i <= N; i++)
        fprintf(fp, "%e\t%e\r\n", ((double) i / N), y[i]);
    fclose(fp);

    free(y);
    free(dy);
    return (end - begin)/REPEATS;
}
/** * Main function **/ int main() { srand (time(NULL)); /* Tree x; x.set(0,4); x.set(1,1); x.set(2,9); x.set(3,2); x.set(4,14); x.set(5,8); x.set(6,13); x.set(7,0); x.set(8,3); x.set(9,12); x.set(10,10); x.set(11,5); x.set(12,7); x.set(13,6); x.set(14,11); //getMax at 1 -> 14 std::cout << x.getMax(0) << std::endl; std::cout << x.getMin(0) << std::endl; std::cout << x.fitness(0) << std::endl; std::cout << x.fitness(3) << std::endl; */ mc=0; std::array<Tree, 200> population; //init population for(int i=0; i<100; i++) { Tree x; x.init(); population[i] = x; } int count = 0; MUTATION_RATE=5; //alter population double start = omp_get_wtime(); while(population[0].fitness(0) > 0.000) { count++; std::list<std::pair<int,double>> fitnesses; //create offspring for(int j=0; j<100; j++) { int p1 = rand() % 100; int p2 = rand() % 100; Tree kid = population[p1].combine(population[p2]); kid.mutate(); population[100+j] = kid; } //calc fitness for(int k=0; k<200; k++) { auto x = std::make_pair(k, population[k].fitness(0)); fitnesses.push_back(x); } //sort by value fitnesses.sort(sort_pred()); //remove old population std::array<Tree, 100> newpop; for(int j=0; j<100; j++) { int newPos = fitnesses.back().first; fitnesses.pop_back(); newpop[j] = population[newPos]; } //clear old population and take the new ones if(count%10000==0) { double end = omp_get_wtime(); std::cout << "Iteration count: " << count << " in " << (end-start) << std::endl; std::cout << "Best fitness: " << population[0].fitness(0) << std::endl; std::cout << "Weakest fitness: " << population[199].fitness(0) << std::endl; std::cout << "Mutation rate: " << MUTATION_RATE << std::endl; std::cout << "Mutation count: " << mc << std::endl; population[0].print(); std::cout << std::endl; start = omp_get_wtime(); } for(int i=0; i<100; i++) { population[i] = newpop[i]; } } std::cout << "found solution with " << count << " iterations.n"; population[0].print(); return 0; }
int main(int argc, char **argv){ bool error = false; //get NSUB, threads, tasks and trails from argument if(argc != 4){ error = true; } else if((NSUB = atoi(argv[1])) == 0) { printf("Invalid subdivison size.\n"); error = true; } else if ((NL = atoi(argv[2])) == 0){ printf("Invalid base function degree.\n"); error = true; } else if ((THREADS = atoi(argv[3])) == 0){ printf("Invalid number of threads.\n"); error = true; } if(error){ printf("Usage: mpirun -np [TASKS] new [SUB_SIZE] [NL] [NUM_THREADS]\n"); exit(EXIT_FAILURE); } if((fp_out = fopen("new_out.txt", "a")) == NULL || (fp_sol = fopen("new_sol.txt", "a")) == NULL){ printf("New Version files not found.\n"); exit(EXIT_FAILURE); } //Allocate array memory adiag = (double *)malloc(sizeof(double)*(double)(NSUB+1)); aleft = (double *)malloc(sizeof(double)*(double)(NSUB+1)); arite = (double *)malloc(sizeof(double)*(double)(NSUB+1)); f = (double *)malloc(sizeof(double)*(double)(NSUB+1)); h = (double *)malloc(sizeof(double)*(double)(NSUB)); indx = (int *)malloc(sizeof(int)*(int)(NSUB+1)); node = (int *)malloc(sizeof(int)*((int)NL*(int)NSUB)); xn = (double *)malloc(sizeof(double)*(double)(NSUB+1)); xquad = (double *)malloc(sizeof(double)*(double)(NSUB)); //START TIMER// double begin, end, time_spent; begin = omp_get_wtime(); //set number of threads omp_set_num_threads(THREADS); /****************** MPI Initialisations ***************/ MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); if(provided != MPI_THREAD_FUNNELED){ return 1; } MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* set up block sizes for MPI work */ slaveSize1 = (NSUB+1) / numprocs; masterSize1 = slaveSize1 + ((NSUB+1) % numprocs); slaveSize2 = NSUB / numprocs; masterSize2 = slaveSize2 + (NSUB % numprocs); printf("MPI: Process %d of %d\n", rank, numprocs); /* If we are the master process Master coordinates the slaves */ if (rank == MASTER){ printf("MASTER: Number of processes is: %d\n",numprocs); timestamp (); 
fprintf (fp_out, "\n" ); fprintf (fp_out, "FEM1D\n" ); fprintf (fp_out, " C version\n" ); fprintf (fp_out, "\n" ); fprintf (fp_out, " Solve the two-point boundary value problem\n" ); fprintf (fp_out, "\n" ); fprintf (fp_out, " - d/dX (P dU/dX) + Q U = F\n" ); fprintf (fp_out, "\n" ); fprintf (fp_out, " on the interval [XL,XR], specifying\n" ); fprintf (fp_out," the value of U or U' at each end.\n" ); fprintf (fp_out, "\n" ); fprintf (fp_out," The interval [XL,XR] is broken into NSUB = %ld subintervals\n", NSUB ); fprintf (fp_out, " Number of basis functions per element is NL = %ld\n", NL ); } //Initialize the data. init (); //Compute the geometric quantities. geometry (); //Assemble the linear system. assemble (); if(rank == MASTER){ //Print out the linear system. prsys (); //Solve the linear system. solve (); //Print out the solution. output (); } //Terminate. fprintf (fp_out, "\n" ); fprintf (fp_out,"FEM1D:\n" ); fprintf (fp_out, " Normal end of execution.\n" ); fprintf ( fp_out,"\n" ); //END TIMER// end = omp_get_wtime(); time_spent = end - begin; timestamp ( ); //CLOSE STREAMS fclose(fp_out); fclose(fp_sol); //FREE MEMORY free(adiag); free(aleft); free(arite); free(f); free(h); free(indx); free(node); free(xn); free(xquad); MPI_Finalize(); if(rank == MASTER){ FILE *fp_time = fopen("times.txt","a"); fprintf(fp_time, "%f\n", time_spent); } return 0; }
/*
 * Incrementally builds a triangulation of the cube from the given data set.
 * Starting from a single boundary facet, it repeatedly picks a random
 * boundary triangle, enumerates the conform tetrahedra on it, filters out
 * those intersecting the current triangulation, adds a random survivor, and
 * re-conforms the data set. tmp_triang_file / tmp_data_file are checkpoint
 * paths (checkpointing is currently commented out). Returns the (possibly
 * partial, if a dead end was hit) triangulation.
 */
triangulation triangulate_cube(data_list * data, char * tmp_triang_file, char * tmp_data_file) {
  printf("%s %s\n", tmp_triang_file, tmp_data_file);
  triangulation result = triangulation_init(data_list_dim(data));
  cube_points cube = gen_cube_points(result.dim);

  /* Shared parameter bundle for the facet conformity tests below. */
  facet_acute_data parameters;
  parameters.cube = &cube;
  parameters.boundary_func = &triangle_boundary_cube;
  parameters.data = data;
  parameters.store_acute_ind = 1;
  parameters.acute_ind = malloc(sizeof(unsigned short) * cube.len);

  //This list holds all conform tetrahedrons for a given triangle, max size = cube.len
  ptetra tet_list = malloc(sizeof(tetra) * cube.len);
  unsigned short tet_list_len = 0;

  //Lists needed for the dynamic_remove loop
  tri_list check_list, check_list_new;
  check_list = tri_list_init(result.dim, MEM_LIST_FALSE);
  check_list_new = tri_list_init(result.dim, MEM_LIST_FALSE);

  //Start triangle (0,0,0), (rand,0,0), (rand,rand,0)
  result.bound_len = 1;
  result.bound_tri = triangulation_start_facet(data);
  printf("Starting triangulation with facet:\n");
  print_triangle(result.bound_tri);

  /*
   * During this method we are going to operate data that is not thread-safe.
   * To avoid race conditions we need an array of locks. We use a lock for the
   * first two points of a triangle (so need 2d array of locks).
   */
  omp_lock_t ** locks = malloc(sizeof(omp_lock_t *) * cube.len);
  //Initalize the locks (triangular layout: row i has cube.len - i locks)
  for (size_t i = 0; i < cube.len; i++){
    locks[i] = malloc(sizeof(omp_lock_t) * (cube.len - i));
    for (size_t j = 0; j < cube.len - i; j++)
      omp_init_lock(&locks[i][j]);
  }

  //While we have triangles on the boundary..
  while (result.bound_len > 0) {
    tri_list_empty(&check_list);
    tri_list_empty(&check_list_new);
    /*
     * We are going to add a tetrahedron on the boundary triangle.
     * To do so, we select a random triangle on the boundary. Then we generate all the
     * acute tetrahedra (above and below) with facets in our possible list.
     * From this list we remove all the tetrahedrons that intersect with our current triangulation.
     * Then we add a random tetrahedron to our triangulation, update the conform list and repeat.
     */
    int rand_bound = rand() % result.bound_len;
    printf("\n\nTotal amount of triangles left:%zu\nExpanding triangulation at boundary triangle: \n", data_list_count(data));
    print_triangle(result.bound_tri + rand_bound);

    //Calculate the conform tetrahedrons above and below
    if (!facet_conform(&result.bound_tri[rand_bound], &parameters)) {
      printf("We have a triangle on the boundary that is not conform anymore.\n");
      printf("Whatthefuck? Breaking!\n");
      break;
    }
    tet_list_len = parameters.acute_ind_len;
    printf("Total amount of conform tetrahedrons found for this boundary: %hu\n", tet_list_len);

    //Form explicit list of the tetrahedrons: the chosen boundary triangle
    //plus each conform apex point found above.
    for (unsigned short i = 0; i < tet_list_len; i++) {
      copyArr3(tet_list[i].vertices[0], result.bound_tri[rand_bound].vertices[0]);
      copyArr3(tet_list[i].vertices[1], result.bound_tri[rand_bound].vertices[1]);
      copyArr3(tet_list[i].vertices[2], result.bound_tri[rand_bound].vertices[2]);
      copyArr3(tet_list[i].vertices[3], cube.points[parameters.acute_ind[i]]);
    }

    //Remove all the tetrahedrons that intersect with current triangulation.
    filter_tet_list_disjoint_triangulation(tet_list, &tet_list_len, &result);
    printf("Amount of tetrahedrons left after filtering: %hu\n\n",tet_list_len);
    if (tet_list_len == 0) {
      printf("Waarom is deze lijst nu al f*****g leeggefilterd?\n");
      printf("Dead end, helaas pindakaas. Got to %zu\n", result.tetra_len);
      break;
    }

    //Select random tetrahedron disjoint with the current triangulation
    int rand_tet = rand() % tet_list_len;

    /*
     * Add the above tetra to the triangulation.
     * This removes all the boundary triangles that are covered by this tetrahedron
     */
    printf("Adding the following tetra to the triangulation\n");
    print_tetra(tet_list + rand_tet);
    printf("\n\n");
    add_tet_triangulation(tet_list + rand_tet, &result);
    triangulation_print(&result);

    if (!result.bound_len) //If we have no boundaries left, we must be done!!
    {
      printf("No more boundaries left.. WE FINNISHED!??\n");
      break;
    }

    //Consistency check
    if (!triangulation_consistent(&result, &parameters)) {
      printf("Triangulation not consistent after adding the tetrahedron. Breaking.\n");
      break;
    }

    /*
     * Calculate a list of all the triangles we are going to remove
     */
    double time_removed = omp_get_wtime();
    printf("Removing triangles not disjoint with new tetrahedron\n");
    size_t removed = filter_intersection_data_list_tet(data, &check_list, tet_list + rand_tet, locks);
    printf("Removed %zu triangles that are not disjoint with the new tetrahedron\n", removed);
    printf("The check_list has size %zu\n", tri_list_count(&check_list));
    printf("Time took to removed triangles: %g seconds\n", omp_get_wtime()-time_removed);

    if (!triangulation_consistent(&result, &parameters)) {
      printf("After filtering the memory list we have a non consistent triangulation. Break\n");
      break;
    }

    //Do two iterations
    facets_conform_dynamic_remove(data, &result, 1, &check_list, &check_list_new, locks);

    if (!triangulation_consistent(&result, &parameters)) {
      printf("Triangulation not consistent anymore after conforming the data set.. Breaking\n");
      break;
    }

    /*mem_list_cube_compress(&data->mem_list);
    if (tmp_triang_file && tmp_data_file) {
      triangulation_to_file(&result, tmp_triang_file);
      data_list_to_file(data, tmp_data_file, MEM_LIST_SAVE_CLEAN);
    }
    */
  }

  //Tear down the lock matrix and all scratch storage.
  for (size_t i = 0; i < cube.len; i++){
    for (size_t j = 0; j < cube.len - i; j++)
      omp_destroy_lock(&locks[i][j]);
    free(locks[i]);
  }
  free(locks);
  free(cube.points);
  free(parameters.acute_ind);
  free(tet_list);
  tri_list_free(&check_list);
  tri_list_free(&check_list_new);
  printf("Triangulation has length of %zu\n", result.tetra_len);
  return result;
}
/*
 * Reads the dimensions of two matrices from stdin, initializes them with
 * simple patterns (a[i][j] = i+j, b[i][j] = i*j), and multiplies them in
 * parallel with OpenMP, timing the parallel region.
 */
int main()
{
    int **a, **b, **c;                       // Variable for saving memory allocation
    int a_r, a_c, b_r, b_c, nthreads, tid, chunk = 10;
    double dif;                              // For time difference calculation
    int i, j, k;

again:
    printf("Enter number of Rows & Columns for Matrix 1: \n");
    scanf("%d%d", &a_r, &a_c);
    printf("Enter number of Rows & Columns for Matrix 2: \n");
    scanf("%d%d", &b_r, &b_c);
    if (a_c != b_r) {
        printf("\ncan not multiply");
        goto again;
    }

    // Memory allocation for Matrix 1: a_r rows of a_c ints.
    // Bug fix: the original allocated "10 * count" raw bytes (wrong size)
    // and iterated the COLUMN count when allocating rows, corrupting memory
    // for any non-square input.
    a = (int **) malloc(a_r * sizeof(int *));
    for (i = 0; i < a_r; i++) {
        a[i] = (int *) malloc(a_c * sizeof(int));
    }

    // Memory allocation for Matrix 2: b_r rows of b_c ints
    b = (int **) malloc(b_r * sizeof(int *));
    for (i = 0; i < b_r; i++) {
        b[i] = (int *) malloc(b_c * sizeof(int));
    }

    // Memory allocation for Product Matrix: a_r rows of b_c ints
    c = (int **) malloc(a_r * sizeof(int *));
    for (i = 0; i < a_r; i++) {
        c[i] = (int *) malloc(b_c * sizeof(int));
    }

    printf("Matrix default Initialization\n");
    printf("Clock Started\n");

    //Setting up Clock
    double start = omp_get_wtime( );

    //Code Parallelization Initiated
    #pragma omp parallel shared(a,b,c,nthreads,chunk) private(tid,i,j,k)
    {
        tid = omp_get_thread_num();
        if (tid == 0) {
            nthreads = omp_get_num_threads();
            printf("Starting matrix multiple example with %d threads\n", nthreads);
        }

        //Initialization of Matrix 1
        #pragma omp for schedule (static, chunk)
        for (i = 0; i < a_r; i++) {
            for (j = 0; j < a_c; j++) {
                a[i][j] = i+j;
            }
        }

        //Initialization of Matrix 2
        #pragma omp for schedule (static, chunk)
        for (i = 0; i < b_r; i++) {
            for (j = 0; j < b_c; j++) {
                b[i][j] = i*j;
            }
        }

        //Initialization of Product Matrix 3
        #pragma omp for schedule (static, chunk)
        for (i = 0; i < a_r; i++) {
            for (j = 0; j < b_c; j++) {
                c[i][j] = 0;
            }
        }

        //Matrix Multiplication
        // Bug fix: result columns must run over b_c and the inner
        // (dot-product) index over a_c; the original swapped these bounds,
        // which is only coincidentally correct for square matrices.
        printf("Thread %d starting Matrix Multiply\n", tid);
        #pragma omp for schedule (static, chunk)
        for (i = 0; i < a_r; i++) {
            printf("Thread %d performed row = %d \n", tid, i);
            for (j = 0; j < b_c; j++) {
                for (k = 0; k < a_c; k++) {
                    c[i][j] = c[i][j] + a[i][k]*b[k][j];
                }
            }
        }
    }

    printf("Matrix Multiplication Done\n");

    //Stop Timer
    double end = omp_get_wtime( );
    dif = end - start;
    printf("Parallelization took %f Seconds\n", dif);

    // Releasing Memory — free exactly the rows that were allocated
    // (the original freed b over a_c rows and c over b_c rows).
    for (i = 0; i < a_r; i++) {
        free(a[i]);
    }
    free(a);
    for (i = 0; i < b_r; i++) {
        free(b[i]);
    }
    free(b);
    for (i = 0; i < a_r; i++) {
        free(c[i]);
    }
    free(c);
    return 0;
}
/*
 * Runs the variant-effect pipeline over a VCF file using three concurrent
 * OpenMP sections: (1) a reader that parses the VCF in batches, (2) a worker
 * that filters records and queries the effect / SNP-phenotype /
 * mutation-phenotype web services (with bounded retries), and (3) a writer
 * that drains the output list into the per-consequence-type result files.
 * Returns the last recorded error code (0 on success).
 */
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;

    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }

    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);

    ret_code = create_directory(output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }

    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }

    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }

    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;

    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);

    // Create job.status file
    char job_status_filename[output_directory_len + 10];
    sprintf(job_status_filename, "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }

    #pragma omp parallel sections private(start, stop, total)
    {
        #pragma omp section
        {
            // Section 1: VCF reader.
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            start = omp_get_wtime();

            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }

        #pragma omp section
        {
            // Section 2: filtering + web service invocations.
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());

            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);

            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;

            // Filename structure outdir/vcfname.errors
            // Bug fix: reserve one extra byte for the terminating NUL; the
            // original allocation was one byte short, so get_filename_from_path
            // could write past the end of the buffer.
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename) + 1, sizeof(char));
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            char *non_processed_filename = malloc((strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9) * sizeof(char));
            sprintf(non_processed_filename, "%s/%s.errors", shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            free(prefix_filename);   // bug fix: was leaked in the original

            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY,
                        ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);

            int i = 0;
            vcf_batch_t *batch = NULL;
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;

            start = omp_get_wtime();

            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }

                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }

                    LOG_DEBUG("VCF header written\n");

                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }

//                 printf("batch loaded = '%.*s'\n", 50, batch->text);
//                 printf("batch text len = %zu\n", strlen(batch->text));

//                 if (i % 10 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                               i, omp_get_thread_num(), batch->records->size, batch->records->capacity);
//                 }

                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?

                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file ? get_num_variables(ped_file) : 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables, batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread, &num_chunks, &chunk_sizes);

                    do {
                        // OpenMP: Launch a thread for each range
                        #pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);
                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            // On a retry pass, only re-invoke the services
                            // that previously failed.
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list, summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc (max_line_size[tid], sizeof(char));
                            }

                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc (snp_max_line_size[tid], sizeof(char));
                                }

                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc (mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }

                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);

                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) {
                                LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0));
                            }
                            if (ret_ws_1) {
                                LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1));
                            }
                            if (ret_ws_2) {
                                LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2));
                            }

                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors ocurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        } else {
                            free(chunk_starts);
                            free(chunk_sizes);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));
                }

                // If the maximum number of reconnections was reached still with errors,
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2)) {
                    #pragma omp critical
                    {
                        write_vcf_batch(batch, non_processed_file);
                    }
                }

                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);

                // Free batch and its contents
                vcf_batch_free(batch);

                i++;
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }

            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);

            // Decrease list writers count so the writer section can terminate
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }

        #pragma omp section
        {
            // Thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t* item = NULL;
            FILE *fd = NULL;

            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");

            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;

                // Type greater than 0: consequence type identified by its SO code
                // Type equals to -1: SNP phenotype
                // Type equals to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    // Bug fix: the original re-declared "int ret" here,
                    // shadowing the outer variable declared above.
                    ret = fprintf(fd, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to file: '%s'\n", line);
                    }

                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }
                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }
                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }

                free(line);
                list_item_free(item);
            }
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);

    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);
    vcf_close(vcf_file);

    update_job_status_file(100, job_status);
    close_job_status_file(job_status);

    return ret_code;
}
/*
  Purpose:

    Driver for SCHEDULE_OPENMP.  Counts the primes in [1,N] three times —
    once with default, once with static, and once with dynamic loop
    scheduling — and prints the wall-clock time of each so the effect of
    load balancing on this deliberately unbalanced workload can be seen.
    N is doubled each row from 1 up to 131072.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    10 July 2010

  Author:

    John Burkardt
*/
int main ( int argc, char *argv[] )
{
  const int n_lo = 1;        /* smallest problem size reported          */
  const int n_hi = 131072;   /* largest problem size reported           */
  const int n_factor = 2;    /* growth factor between successive rows   */
  int n;
  int primes;
  double time1;
  double time2;
  double time3;

  /* Banner. */
  printf ( "\n" );
  printf ( "SCHEDULE_OPENMP\n" );
  printf ( "  C/OpenMP version\n" );
  printf ( "  Count the primes from 1 to N.\n" );
  printf ( "  This is an unbalanced work load, particular for two threads.\n" );
  printf ( "  Demonstrate default, static and dynamic scheduling.\n" );
  printf ( "\n" );
  printf ( "  Number of processors available = %d\n", omp_get_num_procs ( ) );
  printf ( "  Number of threads =              %d\n", omp_get_max_threads ( ) );

  /* Table header. */
  printf ( "\n" );
  printf ( "                           Default        Static       Dynamic\n" );
  printf ( "         N     Pi(N)          Time          Time          Time\n" );
  printf ( "\n" );

  /* One row per problem size; each counter is timed independently.
     All three calls must agree on the prime count. */
  for ( n = n_lo; n <= n_hi; n = n * n_factor )
  {
    time1 = omp_get_wtime ( );
    primes = prime_default ( n );
    time1 = omp_get_wtime ( ) - time1;

    time2 = omp_get_wtime ( );
    primes = prime_static ( n );
    time2 = omp_get_wtime ( ) - time2;

    time3 = omp_get_wtime ( );
    primes = prime_dynamic ( n );
    time3 = omp_get_wtime ( ) - time3;

    printf ( "  %8d  %8d  %12f  %12f  %12f\n", n, primes, time1, time2, time3 );
  }

/*
  Terminate.
*/
  printf ( "\n" );
  printf ( "SCHEDULE_OPENMP\n" );
  printf ( "  Normal end of execution.\n" );

  return 0;
}
/*
 * Solves a linear system via Gaussian elimination with partial pivoting,
 * parallelized with OpenMP.
 *
 * Usage: <program> <matrix-size> <thread-count>
 *
 * The matrix is either read (DEBUG mode) or randomly generated; the
 * augmented matrix is reduced to row-echelon form, back substitution
 * recovers x, and the I^2 norm of the residual is reported along with
 * the elapsed wall-clock time.
 *
 * Fixes vs. original: argv was read unconditionally (crash when arguments
 * were missing or non-positive), and a zero pivot column caused a silent
 * division by zero; both are now diagnosed with an error exit.
 */
int main(int argc, char const *argv[]) {
    int matrixSize;
    int coreCount = omp_get_num_procs();
    int threadCount;
    double startTime, finishTime;
    double **a_augmented, **a;  // n x n Matrix as a 2D array
    double diagonalElement, bestElement, factor;
    int bestRowIndex = 0;       // used in partial pivoting (index of row having greatest absolute value)
    int i, j, k;                // for loop counters
    double *x;                  // Solutions
    double *b;

    // Robustness fix: validate the command line before touching argv.
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <matrix-size> <thread-count>\n", argv[0]);
        return 1;
    }
    matrixSize = strtol(argv[1], NULL, 10);
    threadCount = strtol(argv[2], NULL, 10);
    if (matrixSize <= 0 || threadCount <= 0) {
        fprintf(stderr, "Matrix size and thread count must be positive integers\n");
        return 1;
    }

    printf("Matrix Size: %d\n", matrixSize);
    printf("Number of Cores: %d\n", coreCount);
    #pragma omp parallel num_threads(threadCount)
    {
        if (omp_get_thread_num() == 0)
            printf("Thread Count: %d\n", omp_get_num_threads());
    }

    // Start Timer
    startTime = omp_get_wtime();

    // Allocate memory: a_augmented is the working augmented matrix,
    // a keeps the original system (read-only after generation).
    a_augmented = (double **) malloc(matrixSize * sizeof(double *));
    a = (double **) malloc(matrixSize * sizeof(double *));
    x = (double *) malloc(matrixSize * sizeof(double));
    b = (double *) malloc(matrixSize * sizeof(double));

    if (DEBUG == 1)
        Read_matrix(&a, &a_augmented, matrixSize);
    else
        Gen_matrix(&a, &a_augmented, matrixSize, threadCount);
    // a will not be modified after this point; only a_augmented is.

    displayMatrix(a, matrixSize);

    for (i = 0; i < matrixSize - 1; ++i) {
        // Partial pivoting: pick the row with the largest absolute value
        // in the current column as the pivot row.
        diagonalElement = a_augmented[i][i];
        bestRowIndex = i;
        bestElement = diagonalElement;
        for (j = i + 1; j < matrixSize; ++j) {
            if (fabs(a_augmented[j][i]) > fabs(bestElement)) {
                bestRowIndex = j;
                bestElement = a_augmented[j][i];
            }
        }

        // Correctness fix: a zero pivot column means the matrix is
        // singular; the original divided by zero here.
        if (bestElement == 0.0) {
            fprintf(stderr, "Matrix is singular (zero pivot in column %d)\n", i);
            return 1;
        }

        // Swap the rows if a better pivot row was found.
        if (i != bestRowIndex) {
            swapRow(&a_augmented[i], &a_augmented[bestRowIndex]);
            diagonalElement = a_augmented[i][i];
        }

        // Normalize the pivot row so its diagonal element becomes 1.
        for (j = 0; j < matrixSize + 1; ++j) {
            a_augmented[i][j] = a_augmented[i][j] / diagonalElement;
        }
        // Force exactly 1 to avoid roundoff drift from the division above.
        a_augmented[i][i] = 1;
        diagonalElement = 1;

        // Annihilation: zero every element below the pivot.  Rows are
        // independent, so the loop parallelizes cleanly.
        #pragma omp parallel for num_threads(threadCount) \
        default(none) private(j, factor, k) shared(i, matrixSize, a_augmented)
        for (j = i + 1; j < matrixSize; ++j) {
            factor = a_augmented[j][i];
            if (factor != 0) {
                for (k = i; k < matrixSize + 1; ++k) {
                    a_augmented[j][k] = a_augmented[j][k] - factor * a_augmented[i][k];
                }
            }
        }
    }

    // Normalize the last row (not covered by the elimination loop).
    if (a_augmented[matrixSize-1][matrixSize-1] == 0.0) {
        fprintf(stderr, "Matrix is singular (zero pivot in column %d)\n", matrixSize - 1);
        return 1;
    }
    a_augmented[matrixSize-1][matrixSize] =
        a_augmented[matrixSize-1][matrixSize] / a_augmented[matrixSize-1][matrixSize-1];
    a_augmented[matrixSize-1][matrixSize-1] = 1;

    displayMatrix(a_augmented, matrixSize);

    // Back substitution (parallelized)
    backSubstitution(&a_augmented, matrixSize, threadCount);

    // Record the finish time
    finishTime = omp_get_wtime();

    displayMatrix(a_augmented, matrixSize);

    // Extract solution vector x and right-hand side b.
    // NOTE(review): b[i] = a[i][matrixSize] assumes Gen_matrix/Read_matrix
    // allocate a with matrixSize+1 columns (augmented layout) — confirm.
    for (i = 0; i < matrixSize; ++i) {
        x[i] = a_augmented[i][matrixSize];
        b[i] = a[i][matrixSize];
    }

    // Find I^2 norm of the residual.
    iSquaredNorm(&a, x, b, matrixSize, threadCount);

    // Print the time taken
    printf("Time taken = %f\n", finishTime - startTime);

    // Free memory
    for (i = 0; i < matrixSize; ++i) {
        free(a[i]);
        free(a_augmented[i]);
    }
    free(a);
    free(a_augmented);
    free(x);
    free(b);

    return 0;
}
/*
 * Histogram benchmark driver for libxstream.
 *
 * Reads a file (argv[1]) or generates a random buffer of the requested size
 * in MB, partitions it into batches, and dispatches per-batch histogram
 * computation (makehist) across one stream per device slot.  Per-stream
 * histograms are reduced on the host; the program then reports entropy /
 * compressibility of the data and validates the reduced histogram against
 * a host-side recomputation.
 *
 * Command line: [file | size-in-MB] [batch-size-MB (0 = auto)] [streams].
 */
int main(int argc, char* argv[])
{
  size_t ndevices = 0;
  /* Device discovery is advisory: a missing device only produces a message. */
  if (LIBXSTREAM_ERROR_NONE != libxstream_get_ndevices(&ndevices) || 0 == ndevices) {
    LIBXSTREAM_PRINT0(2, "No device found or device not ready!");
  }

  size_t filesize = 0;
  /* NOTE(review): `file` is never fclose'd; the leak is process-lifetime only. */
  FILE *const file = 1 < argc ? fileopen(argv[1], "rb", &filesize) : 0;
  /* argv[1] is a size in MB when it is not a readable file; default 512 MB. */
  const size_t nitems = (1 < argc && 0 == filesize && 0 < atoi(argv[1]))
    ? (atoi(argv[1]) * (1ULL << 20)/*MB*/)
    : (0 < filesize ? filesize : (512 << 20));
  /* Batch size in MB, clamped to the input size; 0 selects automatic batching. */
  const size_t mbatch = LIBXSTREAM_MIN(2 < argc ? strtoul(argv[2], 0, 10) : 0/*auto*/, nitems >> 20) << 20;
  const size_t mstreams = LIBXSTREAM_MIN(LIBXSTREAM_MAX(3 < argc ? atoi(argv[3]) : 2, 0), LIBXSTREAM_MAX_NSTREAMS);
#if !defined(_OPENMP)
  LIBXSTREAM_PRINT0(1, "OpenMP support needed for performance results!");
#endif
  /* One stream group per device (or one host group); auto batch = even split. */
  const size_t nstreams = LIBXSTREAM_MAX(mstreams, 1) * LIBXSTREAM_MAX(ndevices, 1),
    nbatch = (0 == mbatch) ? (nitems / nstreams) : mbatch,
    hsize = 256;
  size_t histogram[256/*hsize*/];
  memset(histogram, 0, sizeof(histogram));

  char* data;
  { /*allocate and initialize host memory*/
    size_t i;
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(-1/*host*/, (void**)&data, nitems, 0));
    /* Fall back to pseudo-random bytes when there is no file or a short read. */
    if (0 == filesize || nitems > fread(data, 1, filesize, file)) {
      for (i = 0; i < nitems; ++i) data[i] = (char)LIBXSTREAM_MOD(rand(), hsize/*POT*/);
    }
  }

  /* Per-stream state: handle, optional sync event, device-side buffers. */
  struct {
    libxstream_stream* handle;
#if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD)
    libxstream_event* event;
#endif
    size_t* histogram;
    char* data;
  } stream[(LIBXSTREAM_MAX_NDEVICES)*(LIBXSTREAM_MAX_NSTREAMS)];

  { /*allocate and initialize streams and device memory*/
    size_t i;
    for (i = 0; i < nstreams; ++i) {
#if defined(NDEBUG) /*no name*/
      const char *const name = 0;
#else
      char name[128];
      LIBXSTREAM_SNPRINTF(name, sizeof(name), "stream %i", (int)(i + 1));
#endif
      /* Round-robin streams over the available devices (-1 = host). */
      const int device = (0 < ndevices) ? ((int)(i % ndevices)) : -1;
      stream[i].handle = 0;
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_create(0 < mstreams ? &stream[i].handle : 0, device, 0, name));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].data, nbatch, 0));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_allocate(device, (void**)&stream[i].histogram, hsize * sizeof(size_t), 0));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memset_zero(stream[i].histogram, hsize * sizeof(size_t), stream[i].handle));
#if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD)
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_create(&stream[i].event));
#endif
    }
    /*start benchmark with no pending work*/
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(0));
  }

  /*process data in chunks of size nbatch*/
  const size_t nstep = nbatch * nstreams;
  const int end = (int)((nitems + nstep - 1) / nstep);
  int i;
  libxstream_type sizetype = LIBXSTREAM_TYPE_U32;
  /* Resolve the libxstream type matching host size_t at runtime. */
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_autotype(sizeof(size_t), sizetype, &sizetype));
#if defined(_OPENMP)
  /*if (0 == ndevices) omp_set_nested(1);*/
  const double start = omp_get_wtime();
#endif
  for (i = 0; i < end; ++i) {
    const size_t ibase = i * nstep, n = LIBXSTREAM_MIN(nstreams, nitems - ibase);
    libxstream_argument* signature;
    size_t j;
    for (j = 0; j < n; ++j) { /*enqueue work into streams*/
      /* `size` is 0 for the tail when this stream has no remaining data. */
      const size_t base = ibase + j * nbatch, size = base < nitems ? LIBXSTREAM_MIN(nbatch, nitems - base) : 0;
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_h2d(data + base, stream[j].data, size, stream[j].handle));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_signature(&signature));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_input(signature, 0, stream[j].data, LIBXSTREAM_TYPE_CHAR, 1, &size));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_output(signature, 1, stream[j].histogram, sizetype, 1, &hsize));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_fn_call((libxstream_function)makehist, signature, stream[j].handle, LIBXSTREAM_CALL_DEFAULT));
#if defined(SYNCMETHOD) && (2 <= SYNCMETHOD) /*record event*/
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_record(stream[j].event, stream[j].handle));
#endif
    }
#if defined(SYNCMETHOD)
    for (j = 0; j < n; ++j) { /*synchronize streams*/
      const size_t k = n - j - 1; /*j-reverse*/
# if (3 <= (SYNCMETHOD)) /*wait for an event within a stream*/
      /* NOTE(review): event index (j+nstreams-1)%n mixes nstreams with n —
         verify this is the intended cross-stream dependency for short tails. */
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait_event(stream[k].handle, stream[(j+nstreams-1)%n].event));
# elif (2 <= (SYNCMETHOD)) /*wait for an event on the host*/
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_wait(stream[k].event));
# else
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[k].handle));
# endif
    }
#endif
  }

  { /*reduce stream-local histograms*/
    LIBXSTREAM_ALIGNED(size_t local[256/*hsize*/], LIBXSTREAM_MAX_SIMD);
    size_t i, j;
    for (j = 0; j < nstreams; ++j) {
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_memcpy_d2h(stream[j].histogram, local, sizeof(local), stream[j].handle));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_wait(stream[j].handle)); /*wait for pending work*/
      for (i = 0; i < hsize; ++i) histogram[i] += local[i];
    }
  }
#if defined(_OPENMP)
  const double duration = omp_get_wtime() - start;
#endif

  const double kilo = 1.0 / (1 << 10), mega = 1.0 / (1 << 20);
  double entropy = 0;
  { /*calculate entropy*/
    /* Shannon entropy in bits per byte; empty bins contribute zero. */
    const double log2_nitems = log2((double)nitems);
    size_t i;
    for (i = 0; i < hsize; ++i) {
      const double h = (double)histogram[i], log2h = 0 < h ? log2(h) : log2_nitems;
      entropy -= h * LIBXSTREAM_MIN(log2h - log2_nitems, 0);
    }
    entropy /= nitems;
  }

  /* Report the theoretical compression ratio implied by the entropy. */
  if (0 < entropy) {
    if ((1 << 20) <= nitems) { /*mega*/
      fprintf(stdout, "Compression %gx: %.1f -> %.1f MB", 8.0 / entropy, mega * nitems, mega * entropy * nitems / 8.0);
    }
    else if ((1 << 10) <= nitems) { /*kilo*/
      fprintf(stdout, "Compression %gx: %.1f -> %.1f KB", 8.0 / entropy, kilo * nitems, kilo * entropy * nitems / 8.0);
    }
    else {
      fprintf(stdout, "Compression %gx: %.0f -> %0.f B", 8.0 / entropy, 1.0 * nitems, entropy * nitems / 8.0);
    }
    fprintf(stdout, " (redundancy %0.f%%, entropy %.0f bit)\n", 100.0 - 12.5 * entropy, entropy);
  }
#if defined(_OPENMP)
  if (0 < duration) {
    fprintf(stdout, "Finished after %.1f s", duration);
  }
  else {
    fprintf(stdout, "Finished");
  }
#endif

  { /*validate result*/
    /* Bin counts must sum to nitems; otherwise recompute on the host
       and count mismatching bins. */
    size_t check = 0, i;
    for (i = 0; i < hsize; ++i) check += histogram[i];
    if (nitems != check) {
      size_t expected[256/*hsize*/];
      memset(expected, 0, sizeof(expected));
      LIBXSTREAM_CONCATENATE(histogram,HISTOGRAM)(data, nitems, expected);
      check = 0;
      for (i = 0; i < hsize; ++i) check += expected[i] == histogram[i] ? 0 : 1;
      fprintf(stdout, " with %llu error%s\n", (unsigned long long)check, 1 != check ? "s" : "");
    }
    else {
      fprintf(stdout, "\n");
    }
  }

  { /*release resources*/
    size_t i;
    for (i = 0; i < nstreams; ++i) {
      int device = -1;
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_device(stream[i].handle, &device));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].histogram));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(device, stream[i].data));
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_stream_destroy(stream[i].handle));
#if defined(SYNCMETHOD) && 2 <= (SYNCMETHOD)
      LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_event_destroy(stream[i].event));
#endif
    }
    LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_mem_deallocate(-1/*host*/, data));
  }

  return EXIT_SUCCESS;
}
void mm_tst_cases(int NTRIALS, int Ndim, int Mdim, int Pdim, TYPE* A, TYPE* B, TYPE* C, void (*mm_func)(int, int, int, TYPE *, TYPE *, TYPE *)) { int nerr, itrials; double err, errsq, mflops; double start_time, run_time; double min_t, max_t, ave_t; TYPE *Cref; Cref = (TYPE *) malloc (Ndim * Mdim * sizeof(TYPE)); /* Initialize matrices */ init_const_matrix (Ndim, Mdim, Pdim, A, B, Cref); printf("\n constant matrices %d %d %d\n", Ndim, Mdim, Pdim); nerr = 0; min_t = BIG; max_t = SMALL; ave_t = (double) 0.0; for (itrials = 0; itrials<NTRIALS; itrials++){ mm_clear(Ndim, Mdim, C); start_time = omp_get_wtime(); mm_func(Ndim, Mdim, Pdim, A, B, C); run_time = omp_get_wtime() - start_time; errsq = errsqr(Ndim, Mdim, C, Cref); if (errsq > TOL) nerr++; if(run_time < min_t) min_t = run_time; if(run_time > max_t) max_t = run_time; ave_t += run_time; } ave_t = ave_t/(double)NTRIALS; output_results(Ndim, Mdim, Pdim, nerr, ave_t, min_t, max_t); init_progression_matrix (Ndim, Mdim, Pdim, A, B, Cref); #ifdef DEBUG printf(" A progression Matrix input\n"); mm_print(Ndim, Pdim, A); printf(" B progression Matrix input\n"); mm_print(Pdim, Mdim, B); printf(" C Reference Matrix\n"); mm_print(Ndim, Mdim, Cref); #endif printf("\n progression matrices %d %d %d\n", Ndim, Mdim, Pdim); nerr = 0; min_t = BIG; max_t = SMALL; ave_t = (double) 0.0; for (itrials = 0; itrials<NTRIALS; itrials++){ mm_clear(Ndim, Mdim, C); start_time = omp_get_wtime(); mm_func(Ndim, Mdim, Pdim, A, B, C); run_time = omp_get_wtime() - start_time; #ifdef DEBUG printf(" C progression Matrix result\n"); mm_print(Ndim, Mdim, C); #endif errsq = errsqr(Ndim, Mdim, C, Cref); if (errsq > TOL) nerr++; if(run_time < min_t) min_t = run_time; if(run_time > max_t) max_t = run_time; ave_t += run_time; } ave_t = ave_t/(double)NTRIALS; output_results(Ndim, Mdim, Pdim, nerr, ave_t, min_t, max_t); }
/*
 * Multiplies a (NRA x NCA) by b (NCA x NCB) in parallel with OpenMP,
 * then repeats the multiplication sequentially and verifies that both
 * results agree, reporting the wall-clock time of each.
 *
 * Fixes vs. original: the row-pointer tables were initialized with the
 * wrong strides (a used NRA instead of NCA; b used NCA instead of NCB;
 * c and res used NRA instead of NCB), which corrupted every non-square
 * configuration.  All heap blocks are now also freed before exit.
 */
int main (int argc, char *argv[])
{
  int tid, nthreads, i, j, k;
  double	**a, **b, **c;
  double *a_block, *b_block, *c_block;
  double **res;
  double *res_block;
  double starttime, stoptime;

  a = (double **) malloc(NRA*sizeof(double *)); /* matrix a to be multiplied */
  b = (double **) malloc(NCA*sizeof(double *)); /* matrix b to be multiplied */
  c = (double **) malloc(NRA*sizeof(double *)); /* result matrix c */

  a_block = (double *) malloc(NRA*NCA*sizeof(double)); /* Storage for matrices */
  b_block = (double *) malloc(NCA*NCB*sizeof(double));
  c_block = (double *) malloc(NRA*NCB*sizeof(double));

  /* Result matrix for the sequential algorithm */
  res = (double **) malloc(NRA*sizeof(double *));
  res_block = (double *) malloc(NRA*NCB*sizeof(double));

  /* FIX: each row pointer must advance by the matrix's COLUMN count.
     The original strode a by NRA, b by NCA, and c/res by NRA, which is
     only correct when NRA == NCA == NCB (square matrices). */
  for (i=0; i<NRA; i++)   /* Initialize pointers to a (NRA x NCA) */
    a[i] = a_block+i*NCA;
  for (i=0; i<NCA; i++)   /* Initialize pointers to b (NCA x NCB) */
    b[i] = b_block+i*NCB;
  for (i=0; i<NRA; i++)   /* Initialize pointers to c (NRA x NCB) */
    c[i] = c_block+i*NCB;
  for (i=0; i<NRA; i++)   /* Initialize pointers to res (NRA x NCB) */
    res[i] = res_block+i*NCB;

  /* A static allocation of the matrices would be done like this */
  /* double a[NRA][NCA], b[NCA][NCB], c[NRA][NCB]; */

  /*** Spawn a parallel region explicitly scoping all variables ***/
#pragma omp parallel shared(a,b,c,nthreads) private(tid,i,j,k) num_threads(NR_THREADS)
  {
    tid = omp_get_thread_num();
    if (tid == 0) {   /* Only thread 0 prints */
      nthreads = omp_get_num_threads();
      printf("Starting matrix multiplication with %d threads\n",nthreads);
      printf("Initializing matrices...\n");
    }

    /*** Initialize matrices ***/
#pragma omp for nowait    /* No need to synchronize the threads before the */
    for (i=0; i<NRA; i++) /* last matrix has been initialized */
      for (j=0; j<NCA; j++)
	a[i][j]= (double) (i+j);
#pragma omp for nowait
    for (i=0; i<NCA; i++)
      for (j=0; j<NCB; j++)
	b[i][j]= (double) (i*j);
#pragma omp for           /* We synchronize the threads after this */
    for (i=0; i<NRA; i++)
      for (j=0; j<NCB; j++)
	c[i][j]= 0.0;

    if (tid == 0)  /* Thread zero measures time */
      starttime = omp_get_wtime(); /* Master thread measures the execution time */

    /* Do matrix multiply sharing iterations on outer loop */
    /* If DEBUG is TRUE display who does which iterations */
#pragma omp for
    for (i=0; i<NRA; i++) {
      if (DEBUG) printf("Thread=%d did row=%d\n",tid,i);
      for(j=0; j<NCB; j++) {
	for (k=0; k<NCA; k++) {
	  c[i][j] += a[i][k] * b[k][j];
	}
      }
    }

    if (tid == 0) {
      stoptime = omp_get_wtime();
      printf("Time for parallel matrix multiplication: %3.2f s\n", stoptime-starttime);
    }
  }   /*** End of parallel region ***/

  starttime = omp_get_wtime();
  /* Do a sequential matrix multiplication and compare the results */
  for (i=0; i<NRA; i++) {
    for (j=0; j<NCB; j++) {
      res[i][j] = 0.0;
      for (k=0; k<NCA; k++)
	res[i][j] += a[i][k]*b[k][j];
    }
  }
  stoptime = omp_get_wtime();
  printf("Time for sequential matrix multiplication: %3.2f s\n", stoptime-starttime);

  /* Check that the results are the same as in the parallel solution.
     Exact floating-point comparison is normally unsafe, but every value
     here is an integer-valued double, so equality holds when correct. */
  for (i=0; i<NRA; i++) {
    for (j=0; j<NCB; j++) {
      if (res[i][j] == c[i][j]) {
	/* Everything is OK if they are equal */
      }
      else {
	printf("Different result %5.1f != %5.1f in %d %d\n ", res[i][j], c[i][j], i, j);
      }
    }
  }

  /* If DEBUG is true, print the results. Usa smaller matrices for this */
  if (DEBUG) {
    printf("Result Matrix:\n");
    for (i=0; i<NRA; i++) {
      for (j=0; j<NCB; j++)
	printf("%6.1f   ", c[i][j]);
      printf("\n");
    }
  }

  /* fix: release all heap blocks (original leaked them) */
  free(a); free(b); free(c); free(res);
  free(a_block); free(b_block); free(c_block); free(res_block);

  printf ("Done.\n");
  exit(0);
}
int main(int argc, char* argv[]) { int i, j; double start, time1, time2; int M = MM; int N = NN; int P = PP; if (argc != 4) { printf("Suggested Usage: %s <M> <N> <P> \n", argv[0]); printf("Using default values\n"); } else { M = atoi(argv[1]); N = atoi(argv[2]); P = atoi(argv[3]); } double **A = Allocate2DArray< double >(M, P); double **B = Allocate2DArray< double >(P, N); double **C1 = Allocate2DArray< double >(M, N); double **C4 = Allocate2DArray< double >(M, N); for (i = 0; i < M; i++) { for (j = 0; j < P; j++) { A[i][j] = (double)(rand()%100) / 10.0; } } for (i = 0; i < P; i++) { for (j = 0; j < N; j++) { B[i][j] = (double)(rand()%100) / 10.0; } } printf("Matrix Dimensions: M = %d P = %d N = %d\n\n", M, P, N); printf("Execute matmult1\n"); start = omp_get_wtime(); matmult1(M, N, P, A, B, C1); time1 = omp_get_wtime() - start; printf("Time = %f seconds\n\n",time1); printf("Execute matmultr\n"); start = omp_get_wtime(); matmultr(M, N, P, A, B, C4); time2 = omp_get_wtime() - start; printf("Time = %f seconds\n\n",time2); printf("Checking..."); if (CheckResults(M, N, C1, C4)) printf("Error in Recursive Matrix Multiplication\n\n"); else { printf("OKAY\n\n"); printf("Speedup = %5.1fX\n", time1/time2); } Free2DArray< double >(A); Free2DArray< double >(B); Free2DArray< double >(C1); Free2DArray< double >(C4); return 0; }
double timer (void) { return omp_get_wtime (); }