/*
 * Entry point for the timed, sequential Gaussian-elimination run.
 *
 * Passes the command line to parameters(), builds and prints the inputs,
 * times the call to gauss() with both gettimeofday() (wall clock) and
 * times() (CPU usage), then prints X followed by the timing report.
 * Always terminates through exit(0).
 */
int main(int argc, char **argv) {
  /* Timing variables */
  struct timeval etstart, etstop;          /* Elapsed times using gettimeofday() */
  unsigned long long usecstart, usecstop;  /* The same instants in microseconds */
  struct tms cputstart, cputstop;          /* CPU times for my processes */

  /* Process program parameters */
  parameters(argc, argv);

  /* Initialize A and B */
  initialize_inputs();

  /* Print input matrices */
  print_inputs();

  /* Start Clock */
  printf("\nStarting clock.\n");
  /* POSIX leaves the behaviour unspecified when the timezone argument is
   * non-null, so NULL is passed instead of a dummy struct timezone. */
  gettimeofday(&etstart, NULL);
  times(&cputstart);

  /* Gaussian Elimination */
  gauss();

  /* Stop Clock */
  gettimeofday(&etstop, NULL);
  times(&cputstop);
  printf("Stopped clock.\n");

  usecstart = (unsigned long long)etstart.tv_sec * 1000000 + etstart.tv_usec;
  usecstop = (unsigned long long)etstop.tv_sec * 1000000 + etstop.tv_usec;

  /* Display output */
  print_X();

  /* Display timing results.  The microsecond difference is converted in
   * double so long runs are not rounded to float precision first. */
  printf("\nElapsed time = %g ms.\n",
         (double)(usecstop - usecstart) / 1000.0);

  /* NOTE(review): POSIX defines the tick rate of times() as
   * sysconf(_SC_CLK_TCK), not CLOCKS_PER_SEC.  The original scaling is kept
   * below because switching needs <unistd.h>; confirm and convert. */
  printf("(CPU times are accurate to the nearest %g ms)\n",
         1.0/(float)CLOCKS_PER_SEC * 1000.0);
  printf("My total CPU time for parent = %g ms.\n",
         (float)( (cputstop.tms_utime + cputstop.tms_stime) -
                  (cputstart.tms_utime + cputstart.tms_stime) ) /
         (float)CLOCKS_PER_SEC * 1000);
  printf("My system CPU time for parent = %g ms.\n",
         (float)(cputstop.tms_stime - cputstart.tms_stime) /
         (float)CLOCKS_PER_SEC * 1000);
  printf("My total CPU time for child processes = %g ms.\n",
         (float)( (cputstop.tms_cutime + cputstop.tms_cstime) -
                  (cputstart.tms_cutime + cputstart.tms_cstime) ) /
         (float)CLOCKS_PER_SEC * 1000);
  /* Contrary to the man pages, this appears not to include the parent */
  printf("--------------------------------------------\n");

  exit(0);
}
int main(int argc, char **argv) { // /* Timing variables */ // struct timeval etstart, etstop; /* Elapsed times using gettimeofday() */ // struct timezone tzdummy; // clock_t etstart2, etstop2; /* Elapsed times using times() */ // unsigned long long usecstart, usecstop; // struct tms cputstart, cputstop; /* CPU times for my processes */ argc--; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &id); MPI_Comm_size(MPI_COMM_WORLD, &procs); /* Process program parameters */ parameters(argc, argv); if(id == 0) { /* Initialize A and B */ initialize_inputs(); /* Print input matrices */ print_inputs(); } /* Gaussian Elimination */ gauss(); // if(id == 0) { // /* Display output */ // print_X(); // // gauss_test(); // // /* Compare the result*/ // // int right = 1; // // int j = 0; // // for(; j < N; j++) { // // float dif = X[j] - X1[j]; // // if (dif < 0) dif = -dif; // // if (dif > 0.0001) { // // printf("X: %f\n", X[j]); // // printf("X1: %f\n", X1[j]); // // right = 0; // // break; // // } // // } // printf("right: %d\n",right); // if(right == 1) printf("\nRight!\n"); // else printf("\nWrong!\n"); // } MPI_Finalize(); return 0; }
/*
 * Minimal sequential driver: hands the command line to parameters(), sets
 * up and prints the inputs, runs gauss(), prints X and finishes with
 * status 0.
 */
int main(int argc, char **argv) {
  /* Command-line handling */
  parameters(argc, argv);

  /* Build the inputs and show them */
  initialize_inputs();
  print_inputs();

  /* Solve, then show the result */
  gauss();
  print_X();

  /* Returning from main is equivalent to calling exit() with this value */
  return 0;
}
int main(int argc, char **argv) { /* Timing variables */ double start_t; double end_t; /* MPI Variables */ int my_rank; int p; int dest = 0; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &p); if(my_rank == 0) { parameters(argc, argv); } MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); if(my_rank == 0) { float A[N*N], B[N], X[N]; /* Initialize A and B */ initialize_inputs(A, B, X); /* Print input matrices */ print_inputs(A, B); /* Start Clock */ printf("\nStarting clock.\n"); start_t = MPI_Wtime(); gauss(A, B, X, my_rank, p); /* Stop Clock */ end_t = MPI_Wtime(); printf("Stopped clock.\n"); /* Display output */ print_X(X); /* Display timing results */ printf("\nElapsed time = %g s\n", end_t - start_t); printf("--------------------------------------------\n"); } else { workerGauss(my_rank, p); } MPI_Finalize(); }
int main(int argc, char **argv) { /* Prototype functions*/ void gauss(); MPI_Init(&argc, &argv); /* Get my process rank */ MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); /* Find out how many processes are being used */ MPI_Comm_size(MPI_COMM_WORLD, &p); printf("\nProcess number %d of %d says hi\n", my_rank+1, p); /* Every process reads the parameters to prepare dimension */ parameters(argc, argv); /* Every process must allocate memory for the arrays */ allocate_memory(); if ( my_rank == SOURCE ) { /* Initialize A and B */ initialize_inputs(); /* Print input matrices */ print_inputs(); } /*printf("\nProcess number %d of %d says hi\n", my_rank+1, p);*/ gauss(); if ( my_rank == SOURCE ) { /* Print input matrices */ print_A(); print_B(); print_X(); } /* The barrier prevents any process to reach the finalize before the others have finished their communications */ MPI_Barrier(MPI_COMM_WORLD); /* Free memory used for the arrays that we allocated previously */ free_memory(); MPI_Finalize(); }
/*
 * Sequential entry point that takes an extra trailing argument.
 *
 * The last command-line argument is stored in ID and removed from the count
 * passed to parameters().  The inputs are then built and printed, gauss()
 * is timed with gettimeofday(), and X plus the elapsed wall-clock time are
 * printed.  Returns 0.
 */
int main(int argc, char **argv) {
  /* Timing variables */
  struct timeval etstart, etstop;          /* Elapsed times using gettimeofday() */
  unsigned long long usecstart, usecstop;  /* The same instants in microseconds */

  /* Keep the final argument in ID and hide it from parameters() */
  ID = argv[argc-1];
  argc--;

  /* Process program parameters */
  parameters(argc, argv);

  /* Initialize A and B */
  initialize_inputs();

  /* Print input matrices */
  print_inputs();

  /* Start Clock */
  printf("\nStarting clock.\n");
  /* POSIX leaves the behaviour unspecified when the timezone argument is
   * non-null, so NULL is passed.  The times() sampling that used to sit
   * here was never reported and has been dropped. */
  gettimeofday(&etstart, NULL);

  /* Gaussian Elimination */
  gauss();

  /* Stop Clock */
  gettimeofday(&etstop, NULL);
  printf("Stopped clock.\n");

  usecstart = (unsigned long long)etstart.tv_sec * 1000000 + etstart.tv_usec;
  usecstop = (unsigned long long)etstop.tv_sec * 1000000 + etstop.tv_usec;

  /* Display output */
  print_X();

  /* Display timing results; converted in double so long runs are not
   * rounded to float precision first */
  printf("\nElapsed time = %g ms.\n",
         (double)(usecstop - usecstart) / 1000.0);

  return 0;
}
int main(int argc, char **argv) { ID = argv[argc-1]; argc--; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &myid); MPI_Comm_size(MPI_COMM_WORLD, &procs); printf("\nProcess number %d", myid); /* Process program parameters */ parameters(argc, argv); //alocate memory A = (float*)malloc(N*N*sizeof(float)); B = (float*)malloc(N*sizeof(float)); X = (float*)malloc(N*sizeof(float)); /* Initialize A and B */ if (myid == 0) { initialize_inputs(); /* Print input matrices */ print_inputs(); } /* Gaussian Elimination */ gauss(); /* Back substitution */ if (myid == 0) { int row, col; for (row = N - 1; row >= 0; row--) { X[row] = B[row]; for (col = N-1; col > row; col--) { X[row] -= A[row*N + col] * X[col]; } X[row] /= A[row * N + row]; } /* Display output */ print_X(); } free(A); free(B); free(X); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { /* Gaussian Elimination */ int my_rank; /* My process rank */ int p; /* The number of processes */ int norm; /* The number of rows */ /* calculated */ int row; /* Row number */ int col; /* Column number */ int source; /* Process sending integral */ int dest = 0; /* All messages go to 0 */ int tag = 0; MPI_Status status; void Get_data(int my_rank, int p); void Compute(int norm, int my_rank, int p); MPI_Init(&argc, &argv); /* Get my process rank */ MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); /* Find out how many processes are being used */ MPI_Comm_size(MPI_COMM_WORLD, &p); /* Timing variables */ double starttime = 0.0; double endtime = 0.0; printf("Computing Parallel via MPI.\n"); if (my_rank == 0) { /* Start Clock */ printf("\nStarting clock.\n"); starttime = MPI_Wtime(); /* Broadcast the value of N to all nodes */ MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); /* Initialize A and B */ initialize_inputs(); /* Print input matrices */ print_inputs(); } else /* Receive the broadcast N value */ MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); /* Updating needed data in correspoding row of A and B for each process */ Send_data(my_rank, p); /* Gauss elimination */ for (norm = 0; norm < N - 1; norm++) { int i; for (i = norm; i < N; i++) { MPI_Bcast(A[i], N, MPI_FLOAT, i%p, MPI_COMM_WORLD); MPI_Bcast(&B[i], 1, MPI_FLOAT, i%p, MPI_COMM_WORLD); } Compute(norm, my_rank, p); MPI_Barrier(MPI_COMM_WORLD); } MPI_Bcast(A[N-1], N, MPI_FLOAT, (N-1)%p, MPI_COMM_WORLD); MPI_Bcast(&B[N-1], 1, MPI_FLOAT, (N-1)%p, MPI_COMM_WORLD); if (my_rank == 0) { int row, col; /* Back substitution */ for (row = N - 1; row >= 0; row--) { X[row] = B[row]; for (col = N-1; col > row; col--) { X[row] -= A[row][col] * X[col]; } X[row] /= A[row][row]; } /* Stop Clock */ endtime = MPI_Wtime(); /* Display timing results */ printf("That tooks %f seconds.\n", endtime-starttime); /* Display output */ print_X(); } /* Shut down MPI */ MPI_Finalize(); exit(0); }
// simengine_runmodel() // // executes the model for the given parameters, states and simulation time simengine_result *simengine_runmodel(simengine_opts *opts){ double start_time = opts->start_time; double stop_time = opts->stop_time; unsigned int num_models = opts->num_models; const char *outputs_dirname = opts->outputs_dirname; CDATAFORMAT model_states[PARALLEL_MODELS * NUM_STATES]; unsigned int stateid; unsigned int modelid; unsigned int models_executed; unsigned int models_per_batch; double *progress; int progress_fd; int output_fd; int resuming = 0; int random_initialized = 0; # if defined TARGET_GPU gpu_init(); # endif open_progress_file(outputs_dirname, &progress, &progress_fd, num_models); // Create result structure simengine_result *seresult = (simengine_result*)malloc(sizeof(simengine_result)); // Couldn't allocate return structure, return NULL if(!seresult) return NULL; if(seint.num_states){ seresult->final_states = (double*)malloc(num_models * seint.num_states * sizeof(double)); } else{ seresult->final_states = NULL; } seresult->final_time = (double*)malloc(num_models * sizeof(double)); if((seint.num_states && !seresult->final_states) ||!seresult->final_time){ seresult->status = ERRMEM; seresult->status_message = (char*) simengine_errors[ERRMEM]; seresult->final_states = NULL; seresult->final_time = NULL; return seresult; } init_output_buffers(outputs_dirname, &output_fd); // Run the parallel simulation repeatedly until all requested models have been executed for(models_executed = 0 ; models_executed < num_models; models_executed += PARALLEL_MODELS){ models_per_batch = MIN(num_models - models_executed, PARALLEL_MODELS); // Copy inputs and state initial values to internal representation unsigned int modelid_offset = global_modelid_offset + models_executed; #if NUM_CONSTANT_INPUTS > 0 #if defined TARGET_GPU host_constant_inputs = (CDATAFORMAT *)malloc(PARALLEL_MODELS * NUM_CONSTANT_INPUTS * sizeof(CDATAFORMAT)); #else host_constant_inputs = 
constant_inputs; #endif #else CDATAFORMAT *host_constant_inputs = NULL; #endif #if NUM_SAMPLED_INPUTS > 0 #if defined TARGET_GPU host_sampled_inputs = (sampled_input_t *)malloc(STRUCT_SIZE * NUM_SAMPLED_INPUTS * sizeof(sampled_input_t)); #else host_sampled_inputs = sampled_inputs; #endif #else sampled_input_t *host_sampled_inputs = NULL; #endif resuming = initialize_states(model_states, outputs_dirname, num_models, models_per_batch, modelid_offset); initialize_inputs(host_constant_inputs, host_sampled_inputs, outputs_dirname, num_models, models_per_batch, modelid_offset, start_time); #if defined TARGET_GPU && NUM_CONSTANT_INPUTS > 0 CDATAFORMAT *g_constant_inputs; cutilSafeCall(cudaGetSymbolAddress((void **)&g_constant_inputs, constant_inputs)); cutilSafeCall(cudaMemcpy(g_constant_inputs, host_constant_inputs, PARALLEL_MODELS * NUM_CONSTANT_INPUTS * sizeof(CDATAFORMAT), cudaMemcpyHostToDevice)); #endif #if defined TARGET_GPU && NUM_SAMPLED_INPUTS > 0 sampled_input_t *g_sampled_inputs; cutilSafeCall(cudaGetSymbolAddress((void **)&g_sampled_inputs, sampled_inputs)); cutilSafeCall(cudaMemcpy(g_sampled_inputs, host_sampled_inputs, STRUCT_SIZE * NUM_SAMPLED_INPUTS * sizeof(sampled_input_t), cudaMemcpyHostToDevice)); #endif // Initialize the solver properties and internal simulation memory structures solver_props *props = init_solver_props(start_time, stop_time, models_per_batch, model_states, models_executed+global_modelid_offset); // Initialize random number generator if (!random_initialized || opts->seeded) { random_init(models_per_batch); random_initialized = 1; } // If no initial states were passed in if(!resuming){ if(seint.num_states > 0){ // Initialize default states in next_states for(modelid=0;modelid<models_per_batch;modelid++){ init_states(props, modelid); // Copy states from next_states to model_states unsigned int iterid; for(iterid=0;iterid<seint.num_iterators;iterid++){ solver_writeback(&props[iterid], modelid); } } } } // Run the model seresult->status = 
exec_loop(props, outputs_dirname, progress + models_executed, resuming); seresult->status_message = (char*) simengine_errors[seresult->status]; // Copy the final time from simulation for(modelid=0; modelid<models_per_batch; modelid++){ seresult->final_time[models_executed + modelid] = props->time[modelid]; // Time from the first solver } // Free all internal simulation memory and make sure that model_states has the final state values free_solver_props(props, model_states); // Copy state values back to state initial value structure for(modelid=0; modelid<models_per_batch; modelid++){ for(stateid=0;stateid<seint.num_states;stateid++){ seresult->final_states[AS_IDX(seint.num_states, num_models, stateid, models_executed + modelid)] = model_states[TARGET_IDX(seint.num_states, PARALLEL_MODELS, stateid, modelid)]; } } } close_progress_file(progress, progress_fd, num_models); clean_up_output_buffers(output_fd); return seresult; }
main(int argc, char **argv) { //declare the required data structures int N =32; /* Matrix size */ /* Matrices and vectors */ float *A= malloc(MAXN*MAXN); int i,j; //code commented. was used for testing. /* float temp[64] = {1,2,3,4,5,6,7,8, 2,3,4,1,7,4,5,6, 2,3,2,1,2,2,1,1, 4,5,4,5,5,3,4,2, 1,4,8,4,3,7,6,6, 9,7,7,3,2,8,5,4, 8,6,4,1,1,5,3,3, 8,3,2,6,4,6,9,7}; for(i=0;i<N;i++){ for(j=0;j<N;j++) { *(A+((N*i)+j))=temp[i*N+j]; //printf(" %f",*(A+((8*i)+j))); } //printf("\n"); } */ float B[MAXN];// = {5,6,7,3,5,2,9,5}; float X[MAXN];// = {0,0,0,0,0,0,0,0}; int my_rank=0; /* My process rank */ int p; /* The number of processes */ //clock time recording variables double start_time,end_time=0.0; ///////////////////MPI code starts//////////////////// //status variable used to check status of communication operation. MPI_Status status; /* Let the system do what it needs to start up MPI */ MPI_Init(&argc, &argv); /* Get my process rank */ MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); /* Find out how many processes are being used */ MPI_Comm_size(MPI_COMM_WORLD, &p); if(my_rank==0) { /* Process program parameters */ N = parameters(argc, argv); /* Initialize A and B */ initialize_inputs(A, B, X,N); /* Print input matrices */ print_inputs(A, B,N); //Start clock and record the start time. start_time = MPI_Wtime(); } //broadcast the size of the matrix read by the to all processes. MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); //we need all processes to wait here until all others arrive. //we need to make sure that the input matrix has been initialized //by process 0 and the marix size has been propogated to all processes. 
MPI_Barrier(MPI_COMM_WORLD); //declare the local variables int local_no_of_rows; //number of rows to be processesd by each process int local_matrix_size; //size of the matrix float local_norm_row[N]; //the current normaization row float local_matrix_A[N][N]; //the part of A matrix on which each process will work float local_matrix_B[N]; //the part of B matrix on which each process will work int rows_per_process[p]; //the number of rows distributed to each process float local_norm_B; //the element on which B will be normalized int displ[p]; //displacement variable int norm=0; //the index of the current normalizing row //lets begin. The loop is outermost loop of Gaussian elimination operation. for (norm = 0; norm < N - 1; norm++) { //lets scatter the data accross all processes. //This method scatters the matrix A, and broadcasts the current normalizing row, // number of rows each process will work on. scatter_data(norm, my_rank, p, A, N, &local_no_of_rows, &local_matrix_size, local_norm_row, &(local_matrix_A[0][0]), &rows_per_process[0]); //lets calculate the send counts and displacement vector for scatter of B matrix. if(my_rank==0) { //printf(" %d", *(rows_per_process)); *(displ)=0; for(j=1;j<p;j++) { *(displ+j) = rows_per_process[j-1]+ *(displ+j-1); //printf(" %d", *(rows_per_process+j)); } } //This method call scatter the matrix B. Different processes may have different //number of elements to work on, when the size of matrix is not completely divisible //by number of processes. Hence we have used MPI_Scatterv(), instead of MPI_Scatter MPI_Scatterv(B+norm+1, rows_per_process, displ, MPI_FLOAT,local_matrix_B,local_no_of_rows, MPI_FLOAT, 0, MPI_COMM_WORLD); //lets broadcast the element against which matrix B will be normalized. local_norm_B = B[norm]; MPI_Bcast(&local_norm_B, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); //each process performs the following elimination operation on their //share of the matrix A and B. 
eliminate(local_matrix_size, local_no_of_rows, &local_norm_row[0], &(local_matrix_A[0][0]), norm, &(local_matrix_B[0]), local_norm_B); //we need to calculate the counts and displacement for the Gather operation //of the processed matrix A, after each iteration. int counts_for_gather[p]; int displacements_for_gather[p]; if(my_rank==0) { *(displacements_for_gather)=0; counts_for_gather[0] = rows_per_process[0]*local_matrix_size; for(j=1;j<p;j++) { counts_for_gather[j] = rows_per_process[j]*local_matrix_size; *(displacements_for_gather+j) = counts_for_gather[j-1]+ *(displacements_for_gather+j-1); } } //here we gather the processed matrix A from all processes and store it locally MPI_Gatherv(local_matrix_A, local_no_of_rows*local_matrix_size, MPI_FLOAT, A+(N*(norm+1)), counts_for_gather, displacements_for_gather, MPI_FLOAT, 0, MPI_COMM_WORLD); //similarly we gather the processed matrix B. MPI_Gatherv(local_matrix_B, local_no_of_rows, MPI_FLOAT, B+norm+1, rows_per_process, displ, MPI_FLOAT, 0, MPI_COMM_WORLD); } //We need to wait for al processes to complete before we go ahead with //back subsitution. MPI_Barrier(MPI_COMM_WORLD); //perform the back substitution operation only by process 0. int row,col; if(my_rank==0){ /* Back substitution */ for (row = N - 1; row >= 0; row--) { X[row] = B[row]; for (col = N-1; col > row; col--) { X[row] -= *(A+(N*row)+col) * X[col]; } X[row] /= *(A+(N*row)+col); } //Stop clock as operation is finished. end_time = MPI_Wtime(); //display X in matrix size is small. if (N < 100) { printf("\nX = ["); for (row = 0; row < N; row++) { printf("%5.2f%s", X[row], (row < N-1) ? "; " : "]\n"); } } //print the execution time for performance analysis purpose. printf("\n\nThe total execution time as recorded on process 0 = %f seconds!!\n!",end_time-start_time); } MPI_Finalize(); }
int main(int argc, char** argv) { int my_rank, size, i; //char msg0[] = "Wasssssaaaaap Ich hasse mein Leif Hundin!\n"; //char msg1[] = "Lol I hate my life\n"; //char rcv0[100], rcv1[100]; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &size); printf("Started Process %d of %d\n", my_rank, (size - 1)); if (my_rank == 0) { printf("Computing in Parallel on %d Processes\n", size); /* Process program parameters */ parameters(argc, argv); /* Initialize A and B */ initialize_inputs(); /* Print input matrices */ print_inputs(); /* Gaussian elimination */ for (norm = 0; norm < N - 1; norm++) /*Proceeding sequentially on each norm row because of *Read-After-Write dependence between each norm variable iteration. */ { i = 0; for (row = norm + 1; row < N; row += blockSize) /*Putting values in the 'inidices' dynamic array described above. *Note that this loop increments with a step size equal to the blockSize value *which is the number of rows each thread will be handling. */ { indices[3 * i] = row; /*First value storing the starting row index.*/ if ((row + blockSize - 1) < N) /*Second value stores the ending row index.*/ indices[3 * i + 1] = row + blockSize - 1; else indices[3 * i + 1] = N - 1; indices[3 * i + 2] = norm; /*Third value stores value of current normalization row index.*/ i++; } numCPU = i; /*Ensures that number of threads launched is equal to the number of proceesing lbocks made.*/ for (i = 0; i < numCPU; i++) { pthread_create(rowThreads + i, NULL, processRows, (indices + 3 * i)); /*Launching each thread to operate on different parts of the array*/ } for (i = 0; i < numCPU; i++) { pthread_join(*(rowThreads + i), NULL); /*Consolidating all threads*/ } } /* (Diagonal elements are not normalized to 1. This is treated in back * substitution.) 
*/ for (i = 1; i < size; i++) { MPI_Send(A, (MAXN*MAXN), MPI_FLOAT, i, 0, MPI_COMM_WORLD); MPI_Send(B, MAXN, MPI_FLOAT, i, 1, MPI_COMM_WORLD); MPI_Send(&N, 1, MPI_INT, i, 2, MPI_COMM_WORLD); printf("Data sent to processor %d!\n", i); } } else { MPI_Recv(A, (MAXN*MAXN), MPI_FLOAT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(B, MAXN, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(&N, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("Received size of data (Value of N) = %d\n", N); printf("Received data with tag 0 & 1\n"); print_inputs(); } MPI_Finalize(); return 0; }