/*
 * Serial in-place 2D FFT of an N x N complex matrix.
 *
 * Strategy: forward-FFT every row, transpose so columns become rows,
 * forward-FFT those rows (i.e. the original columns), then transpose
 * back to restore the original layout.
 *
 * inp: N x N matrix, transformed in place.
 * NOTE(review): the third argument of c_fft1d appears to select the
 * transform direction (-1 forward, 1 inverse elsewhere in this file) —
 * confirm against c_fft1d's definition.
 *
 * Fix: removed the unused local variable `j`.
 */
void c_fft2d_serial(complex inp[N][N]) {
    int i;

    /* Forward FFT of every row. */
    for (i = 0; i < N; i++) {
        c_fft1d(&inp[i][0], N, -1);
    }
    /* Columns become rows; FFT them, then restore the layout. */
    transpose(inp);
    for (i = 0; i < N; i++) {
        c_fft1d(&inp[i][0], N, -1);
    }
    transpose(inp);
}
/*
 * Perform 1D FFTs over this rank's share of the rows of a 512 x 512
 * complex matrix stored row-major in a flat array.
 *
 * data:    flat row-major 512*512 complex matrix (shared layout).
 * type:    transform direction passed straight to c_fft1d.
 * p:       total number of ranks sharing the work.
 * my_rank: this rank's index in [0, p).
 *
 * Rank r owns rows [r*workload, (r+1)*workload).
 * NOTE(review): assumes p divides 512 evenly; trailing rows are silently
 * skipped otherwise — confirm with the caller.
 *
 * Fix: removed the unused local variable `j`; named the magic constant.
 */
void execute_fft(complex data[512*512], int type, int p, int my_rank) {
    enum { DIM = 512 };          /* matrix dimension (rows == columns) */
    int i;
    int workload = DIM / p;      /* rows handled by each of the p ranks */

    for (i = 0; i < workload; i++) {
        c_fft1d(&data[(my_rank * workload * DIM) + (DIM * i)], DIM, type);
    }
}
/*
 * Inverse 1D FFT (direction argument 1) of each of the local_n rows held
 * in local_mat, which is a flat row-major slice of an N-column matrix.
 */
void c_rowwise_inv_fft2d(complex *local_mat, int local_n) {
    int row;

    for (row = 0; row < local_n; row++) {
        complex *row_start = &local_mat[row * N];
        c_fft1d(row_start, N, 1);
    }
}
void convolution(int my_id, int p){ int i, j, k; int chunkSize; double start, end; double time[14]; /* Input data */ float input_1[N][N], input_2[N][N]; /* Output data */ float output[N][N]; /* Set the chunk size for each processor */ chunkSize = N/p; /* Two arrays storing the local data distributed by rank 0 */ float local_data1[N][N], local_data2[N][N]; /* Local matrix for matrix multiplication */ float local_data3[chunkSize][N]; /* A complex array storing the temp row to operate FFT */ complex temp_data[N]; /* Initialization of the original Matrix and distribution of data */ if(my_id == 0){ printf("2D convolution using SPMD model and MPI Collective operations\n"); start = MPI_Wtime(); /*Read data from the files*/ readFile(input_1, input_2); time[0] = MPI_Wtime(); printf("Reading file takes %f s.\n", time[0] - start); } /* Scatter all the data to local data */ MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT, local_data1, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Scatter(input_2, chunkSize*N, MPI_FLOAT, local_data2, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); /* Compute time for distributing data */ if(my_id == 0){ time[1] = MPI_Wtime(); printf("Scattering data of rows to each processor takes %f s.\n", time[1] - time[0]); } /* Row FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data1[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data1[i][j] = temp_data[j].r; for(j = 0; j < N; j++){ /* FFT each row for im2 */ temp_data[j].r = local_data2[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data2[i][j] = temp_data[j].r; } /* Gather all the data and distribute in columns */ if(my_id == 0){ time[2] = MPI_Wtime(); printf("FFT each row for input im1 and im2 takes %f s.\n", time[2] - time[1]); } MPI_Gather(local_data1, chunkSize*N, MPI_FLOAT, input_1, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); 
MPI_Gather(local_data2, chunkSize*N, MPI_FLOAT, input_2, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); if(my_id == 0){ time[3] = MPI_Wtime(); printf("Gathering all the data from different rows takes %f s.\n", time[3] - time[2]); } /* Initialize a new vector for distributing columns */ MPI_Datatype column, col; /* Column vector */ MPI_Type_vector(N, 1, N, MPI_FLOAT, &col); MPI_Type_commit(&col); MPI_Type_create_resized(col, 0, 1*sizeof(float), &column); MPI_Type_commit(&column); /* Scatter all the data to column local data */ MPI_Scatter(input_1, chunkSize, column, local_data1, chunkSize, column, 0, MPI_COMM_WORLD); MPI_Scatter(input_2, chunkSize, column, local_data2, chunkSize, column, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if(my_id == 0){ time[4] = MPI_Wtime(); printf("Scattering data of columns to each processor takes %f s.\n", time[4] - time[3]); } /* Column FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data1[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data1[j][i] = temp_data[j].r; for(j = 0; j < N; j++){ /* FFT each column for im2 */ temp_data[j].r = local_data2[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data2[j][i] = temp_data[j].r; } /* Gather all the columns from each rank */ if(my_id == 0){ time[5] = MPI_Wtime(); printf("FFT each column for input im1 and im2 takes %f s.\n", time[5] - time[4]); } MPI_Gather(local_data1, chunkSize, column, input_1, chunkSize, column, 0, MPI_COMM_WORLD); MPI_Gather(local_data2, chunkSize, column, input_2, chunkSize, column, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); /* Compute time and distribute data to do matrix multiplication */ if(my_id == 0){ time[6] = MPI_Wtime(); printf("Gathering all the data from different columns takes %f s.\n", time[6] - time[5]); } MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT, local_data1, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); /* 
Broadcast data2 to all the ranks */ MPI_Bcast(input_2, N*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if(my_id == 0){ time[7] = MPI_Wtime(); printf("Scattering data for multiplication takes %f s.\n", time[7] - time[6]); } /* Matrix multiplication */ for(i = 0; i < chunkSize; i++) for(j = 0; j < N; j++) for(k = 0; k < N; k++) local_data3[i][j] += local_data1[i][k]*input_2[k][j]; /* Collect multiplication results from each rank */ if(my_id == 0){ time[8] = MPI_Wtime(); printf("Matrix multiplication takes %f s.\n", time[8] - time[7]); } /* Inverse-2DFFT(row) for the output file */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data3[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 1); for(j = 0; j < N; j++) local_data3[i][j] = temp_data[j].r; } if(my_id == 0){ time[9] = MPI_Wtime(); printf("Inverse-2DFFT for out_1(row) takes %f s.\n", time[9] - time[8]); } MPI_Gather(local_data3, chunkSize*N, MPI_FLOAT, output, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if(my_id == 0){ time[10] = MPI_Wtime(); printf("Gathering all the data of Inverse-2DFFT for out_1(row) takes %f s.\n", time[10] - time[9]); } MPI_Scatter(output, chunkSize, column, local_data1, chunkSize, column, 0, MPI_COMM_WORLD); if(my_id == 0){ time[11] = MPI_Wtime(); printf("Scattering out_1(column) to each processor takes %f s.\n", time[11] - time[10]); } /* Inverse-2DFFT(column) for the output file */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data1[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 1); for(j = 0; j < N; j++) local_data1[j][i] = temp_data[j].r; } /* Gathering all the columns of the output file from each rank */ if(my_id == 0){ time[12] = MPI_Wtime(); printf("Inverse-2DFFT out_1(column) takes %f s.\n", time[12] - time[11]); } MPI_Gather(local_data1, chunkSize, column, output, chunkSize, column, 0, MPI_COMM_WORLD); if(my_id 
== 0){ time[13] = MPI_Wtime(); printf("Gathering all the data of the output file(column) takes %f s.\n", time[13] - time[12]); writeFile(output); end = MPI_Wtime(); printf("Writing the output file to file takes %f s.\n", end - time[13]); printf("Total communication time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[13] - time[12] + time[11] - time[10] + time[7] - time[5] + time[4] - time[2] + time[1] - time[0]); printf("Total computing time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[12] - time[11] + time[10] - time[7] + time[5] - time[4] + time[2] - time[1]); printf("Total running time without loading/writing of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[13] - time[0]); printf("Total running time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", end - start); } /* Free vector column */ MPI_Type_free(&column); MPI_Type_free(&col); }
int main(int argc, char **argv) { int i, j, k; double start, end; /* Time array */ double time[9]; double comm_time = 0; double comp_time = 0; int chunkSize; MPI_Status status; /* Being used in FFT */ float data[N][N]; /* Being used in mm */ float input_1[N][N], input_2[N][N]; /* Local matrix for FFT */ float local_data[N][N]; /* World rank and processor, related to MPI_COMM_WORLD */ int world_id; int world_processor; /* Divided rank and processors for communication, related to taskcomm */ int task_id; int task_processor; /* A complex array storing the temp row to operate FFT */ complex temp_data[N]; /* Initialize rank and the number of processor for the MPI */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &world_id); MPI_Comm_size(MPI_COMM_WORLD, &world_processor); /* Initialize a new vector for distributing columns */ MPI_Datatype column, col; /* Column vector */ MPI_Type_vector(N, 1, N, MPI_FLOAT, &col); MPI_Type_commit(&col); MPI_Type_create_resized(col, 0, 1*sizeof(float), &column); MPI_Type_commit(&column); int task = world_id%4; MPI_Comm taskcomm; /* Split the MPI_COMM_WORLD */ MPI_Comm_split(MPI_COMM_WORLD, task, world_id, &taskcomm); MPI_Comm_rank(taskcomm, &task_id); MPI_Comm_size(taskcomm, &task_processor); /* Initialize inter communicators */ MPI_Comm t1_t3_comm, t2_t3_comm, t3_t4_comm; /* Calculate chunkSize */ chunkSize = N/task_processor; /* Get the start time of all program */ if(world_id == 0){ printf("2D convolution using MPI task and data parallelism\n"); start = MPI_Wtime(); } /* Each group completes work and send results by inter communicators */ if(task == 0){ // task 1 /* Create an inter communicator for task 1 and task 3 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 1, &t1_t3_comm); if(task_id == 0){ time[0] = MPI_Wtime(); /* Read file */ readIm1File(data); time[1] = MPI_Wtime(); printf("Group 1: Reading file 1_im1 takes %f s.\n", time[1] - time[0]); } /* Scatter data to local ranks */ MPI_Scatter(data, chunkSize*N, 
MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); /* Compute time for distributing data */ if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 1: Scattering 1_im1(row) to each processor takes %f s.\n", time[2] - time[1]); } /* Do 1_im1 2d FFT */ /* Row FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[i][j] = temp_data[j].r; } /* Gather all the data and distribute in columns */ if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 1: FFT each row for 1_im1 takes %f s.\n", time[3] - time[2]); } /* Gather all the data of 1_im1 */ MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 1: Gathering all the data of 1_im1(row) takes %f s.\n", time[4] - time[3]); } /* Scatter all the data to column local data */ MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[5] = MPI_Wtime(); printf("Group 1: Scattering 1_im1(column) to each processor takes %f s.\n", time[5] - time[4]); } /* Column FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[j][i] = temp_data[j].r; } /* Gather all the columns from each rank */ if(task_id == 0){ time[6] = MPI_Wtime(); printf("Group 1: FFT each column for 1_im1 takes %f s.\n", time[6] - time[5]); } MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0, taskcomm); /* Compute time and distribute data to do matrix multiplication */ if(task_id == 0){ time[7] = MPI_Wtime(); printf("Group 1: Gathering all the data of 1_im1(column) takes %f s.\n", time[7] - time[6]); /* Total time */ printf("Group 1: Total time for task 1 in group 1 takes %f s.\n", time[7] - time[0]); 
comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1]; comp_time += time[6] - time[5] + time[3] - time[2]; /* Send data to group 3 via the inter communicator */ MPI_Send(data, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm); } } else if(task == 1){ // Task 2 /* Create an inter communicator for task 2 and task 3 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 2, &t2_t3_comm); if(task_id == 0){ time[0] = MPI_Wtime(); /* Read file */ readIm2File(data); time[1] = MPI_Wtime(); printf("Group 2: Reading file 1_im2 takes %f s.\n", time[1] - time[0]); } /* Scatter data to local ranks */ MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); /* Compute time for distributing data */ if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 2: Scatter 1_im2(row) to each processor takes %f s.\n", time[2] - time[1]); } /* Do 1_im1 2d FFT */ /* Row FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[i][j] = temp_data[j].r; } /* Gather all the data and distribute in columns */ if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 2: FFT each row for 1_im2 takes %f s.\n", time[3] - time[2]); } /* Gather all the data of 1_im1 */ MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 2: Gather all the data of 1_im2(row) takes %f s.\n", time[4] - time[3]); } /* Scatter all the data to column local data */ MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[5] = MPI_Wtime(); printf("Group 2: Scatter 1_im2(column) to each processor takes %f s.\n", time[5] - time[4]); } /* Column FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data[j][i]; temp_data[j].i = 0; } 
c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[j][i] = temp_data[j].r; } /* Gather all the columns from each rank */ if(task_id == 0){ time[6] = MPI_Wtime(); printf("Group 2: FFT each column for 1_im2 takes %f s.\n", time[6] - time[5]); } MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0, taskcomm); /* Compute time and distribute data to do matrix multiplication */ if(task_id == 0){ time[7] = MPI_Wtime(); printf("Group 2: Gather all the data of 1_im2(column) takes %f s.\n", time[7] - time[6]); /* Total time */ printf("Group 2: Total time for task 2 in group 2 takes %f s.\n", time[7] - time[0]); comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1]; comp_time += time[6] - time[5] + time[3] - time[2]; /* Send data to group 3 via the inter communicator */ MPI_Send(data, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm); } } else if(task == 2){ // Task 3 /* Local matrix for matrix multiplication */ float local_data2[chunkSize][N]; /* Create inter communicators for task 1 and task3, task 2 and task 3, task 3 and task 4 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 0, 1, &t1_t3_comm); MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 1, 2, &t2_t3_comm); MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 3, 3, &t3_t4_comm); /* Receive data from group 1 and group 2 */ if(task_id == 0){ time[0] = MPI_Wtime(); MPI_Recv(input_1, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm, &status); MPI_Recv(input_2, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm, &status); time[1] = MPI_Wtime(); /* Time of receiving data from group 1 and group 2 */ printf("Group 3: Receiving data from group 1 and group 2 takes %f s.\n", time[1] - time[0]); } /* Do matrix multiplication */ MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); /* Broadcast data2 to all the ranks */ MPI_Bcast(input_2, N*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 3: Scattering data for multiplication takes %f 
s.\n", time[2] - time[1]); } /* Matrix multiplication */ for(i = 0; i < chunkSize; i++) for(j = 0; j < N; j++){ local_data2[i][j] = 0; for(k = 0; k < N; k++) local_data2[i][j] += local_data[i][k]*input_2[k][j]; } /* Collect multiplication result from each rank */ if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 3: Matrix multiplication takes %f s.\n", time[3] - time[2]); } /* Gather data */ MPI_Gather(local_data2, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 3: Gathering data after Matrix multiplication takes %f s.\n", time[4] - time[3]); /* total time */ printf("Group 3: Total time for task 3 in group 3 takes %f s.\n", time[4] - time[0]); /* send result of matrix multiplication to group 4 */ MPI_Send(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm); } comm_time += time[4] - time[3] + time[2] - time[0]; comp_time += time[3] - time[2]; MPI_Comm_free(&t1_t3_comm); MPI_Comm_free(&t2_t3_comm); } else{ // Task 4 /* Create an inter communicator for task 3 and task 4 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 3, &t3_t4_comm); /* Receive data from group 3 */ if(task_id == 0){ time[0] = MPI_Wtime(); MPI_Recv(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm, &status); time[1] = MPI_Wtime(); printf("Group 4: Receiving data from group 3 takes %f s.\n", time[1] - time[0]); } /* Scatter data to each processor */ MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 4: Scattering data of rows to each processor takes %f s.\n", time[2] - time[1]); } /* Inverse-2DFFT(row) */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 1); for(j = 0; j < N; j++) local_data[i][j] = temp_data[j].r; } if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 4: Inverse-2DFFT(row) takes %f s.\n", time[3] - 
time[2]); } /* Gather all the data */ MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 4: Gathering data of Inverse-2DFFT(row) takes %f s.\n", time[4] - time[3]); } MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[5] = MPI_Wtime(); printf("Group 4: Scattering data of columns to each processor takes %f s.\n", time[5] - time[4]); } /* Inverse-2DFFT(column) for output file */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 1); for(j = 0; j < N; j++) local_data[j][i] = temp_data[j].r; } if(task_id == 0){ time[6] = MPI_Wtime(); printf("Group 4: Inverse-2DFFT(column) takes %f s.\n", time[6] - time[5]); } /* Gather all the columns of output file from each rank */ MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[7] = MPI_Wtime(); printf("Group 4: Gathering data of Inverse-2DFFT(column) takes %f s.\n", time[7] - time[6]); writeFile(data); time[8] = MPI_Wtime(); printf("Group 4: Writing file to out_1 takes %f s.\n", time[8] - time[7]); comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[0]; comp_time += time[6] - time[5] + time[3] - time[2]; } MPI_Comm_free(&t3_t4_comm); } MPI_Barrier(MPI_COMM_WORLD); if(world_id == 0){ end = MPI_Wtime(); printf("Total communication time of 2D convolution using MPI task parallel takes %f s.\n", comm_time); printf("Total computing time of 2D convolution using MPI task parallel takes %f s.\n", comp_time); printf("Total running time without loading/writing of 2D convolution using MPI task parallel takes %f s.\n", comm_time + comp_time); printf("Total running time of 2D convolution using MPI task parallel takes %f s.\n", end - start); } /* Free vector and task comm */ MPI_Type_free(&column); 
MPI_Type_free(&col); MPI_Comm_free(&taskcomm); MPI_Finalize(); return 0; }
/*
 * Serial variant: 1D FFT of every row of a 512 x 512 complex matrix
 * stored row-major in a flat array.
 *
 * data: flat row-major 512*512 complex matrix, transformed in place.
 * type: transform direction passed straight to c_fft1d.
 *
 * NOTE(review): a four-argument execute_fft() is also defined earlier in
 * this file; two external definitions with the same name cannot coexist
 * in one translation unit — confirm these belong to separate builds.
 *
 * Fix: removed the unused local variable `j`; named the magic constant.
 */
void execute_fft(complex data[512*512], int type) {
    enum { DIM = 512 };          /* matrix dimension (rows == columns) */
    int i;

    for (i = 0; i < DIM; i++) {
        c_fft1d(&data[DIM * i], DIM, type);
    }
}