/*
 * OpenMP matrix multiplication driver: computes C = A x B.
 *
 * Usage: <prog> <NumRowA> <NumColA> <NumColB>
 * Hint: `export OMP_NUM_THREADS=1` gives a single-threaded baseline.
 *
 * The reported time spans the whole run (argument parsing, allocation,
 * multiplication and printing of all three matrices), because the clock is
 * started first and stopped last.
 *
 * Returns 0, also when only the usage text is printed.
 */
int main(int argc, char *argv[])
{
    float **A, **B, **C;    // matrices
    int d1, d2, d3;         // dimensions of matrices
    int i, j, k;            // loop variables
    double start, end;

    start = omp_get_wtime();

    /* print user instruction */
    if (argc != 4) {
        printf ("Matrix multiplication: C = A x B\n");
        printf ("Usage: %s <NumRowA> <NumColA> <NumColB>\n", argv[0]);
        return 0;
    }

    /* read user input */
    d1 = atoi(argv[1]);     // rows of A and C
    d2 = atoi(argv[2]);     // cols of A and rows of B
    d3 = atoi(argv[3]);     // cols of B and C

    printf("Matrix sizes C[%d][%d] = A[%d][%d] x B[%d][%d]\n", d1, d3, d1, d2, d2, d3);

    /* prepare matrices */
    A = alloc_mat(d1, d2);
    init_mat(A, d1, d2);
    B = alloc_mat(d2, d3);
    init_mat(B, d2, d3);
    C = alloc_mat(d1, d3);  // C is fully written by the loop nest below

    printf("Perform matrix multiplication...\n");

    /*
     * Parallel matmult.
     *
     * Only the (i, j) iteration space is distributed: every (i, j) pair owns
     * exactly one element of C, so the iterations are independent and need no
     * synchronisation. The k loop must NOT be collapsed or parallelised, since
     * all of its iterations accumulate into the same C[i][j]; doing so either
     * races or forces an atomic update per multiply-add.
     *
     * i and j are made private by the collapsed loop construct; k is declared
     * at function scope and therefore has to be listed explicitly.
     */
    #pragma omp parallel for collapse(2) private(k)
    for (i = 0; i < d1; i++) {
        for (j = 0; j < d3; j++) {
            float acc = 0.0f;   // per-element accumulator, private to the iteration
            for (k = 0; k < d2; k++)
                acc += A[i][k] * B[k][j];
            C[i][j] = acc;      // plain store: does not depend on C being zeroed
        }
    }

    /* test output */
    print_mat(A, d1, d2, "A");
    print_mat(B, d2, d3, "B");
    print_mat(C, d1, d3, "C");

    printf ("\nDone.\n");

    end = omp_get_wtime();
    printf("This task took %f seconds\n", end-start);

    return 0;
}
/*
 * MPI driver: allocates the operand and result matrices, times one call to
 * multiply_mat() and prints the communicator size next to the elapsed time.
 *
 * Every rank executes the same code and prints its own line.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE if a matrix could not be
 * allocated.
 */
int main(int argc, char **argv){
    int nb = 0;                 // number of processes in MPI_COMM_WORLD
    double t, start, stop;
    double *mat_A = NULL;
    double *mat_B = NULL;
    double *mat_res = NULL;

    // MPI_Init takes the ADDRESSES of argc/argv so it can strip its own options
    MPI_Init(&argc, &argv);
    MPI_Barrier(MPI_COMM_WORLD);

    // Allocations
    mat_res = alloc_mat();

    // Init
    mat_A = init_mat();
    mat_B = init_mat();

    if (mat_A == NULL || mat_B == NULL || mat_res == NULL) {
        fprintf(stderr, "Matrix allocation failed\n");
        free(mat_A);
        free(mat_B);
        free(mat_res);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    // The communicator size is fixed for the lifetime of the job, so it is
    // queried once into its own variable instead of overwriting a loop counter.
    MPI_Comm_size(MPI_COMM_WORLD, &nb);

    printf("Nb.threads\tTps.\n");

    start = get_time();
    multiply_mat(mat_A, mat_B, mat_res);
    stop = get_time();
    t = stop - start;
    printf("%d\t%f\n", nb, t);

    // Memory free
    free(mat_A);
    free(mat_B);
    free(mat_res);

    // Must run before returning; no MPI call is allowed afterwards
    MPI_Finalize();
    return EXIT_SUCCESS;
}
/*
 * Serial matrix multiplication driver: C = A x B.
 *
 * Expects three command-line arguments (rows of A, columns of A, columns of B);
 * with any other argument count it prints a usage text and exits.
 * Prints A, B and C after the multiplication. Always returns 0.
 */
int main(int argc, char *argv[])
{
    float **A, **B, **C;            /* operand and result matrices */
    int rows_a, inner, cols_b;      /* matrix dimensions */
    int r, c, t;                    /* loop counters */

    /* Wrong argument count: show how to call the program and stop. */
    if (argc != 4) {
        printf ("Matrix multiplication: C = A x B\n");
        printf ("Usage: %s <NumRowA> <NumColA> <NumColB>\n", argv[0]);
        return 0;
    }

    /* Dimensions come straight from the command line. */
    rows_a = atoi(argv[1]);         /* rows of A and C */
    inner  = atoi(argv[2]);         /* cols of A and rows of B */
    cols_b = atoi(argv[3]);         /* cols of B and C */

    printf("Matrix sizes C[%d][%d] = A[%d][%d] x B[%d][%d]\n",
           rows_a, cols_b, rows_a, inner, inner, cols_b);

    /* A and B are allocated and filled; C is only allocated. */
    A = alloc_mat(rows_a, inner);
    init_mat(A, rows_a, inner);
    B = alloc_mat(inner, cols_b);
    init_mat(B, inner, cols_b);
    /* NOTE(review): the accumulation below presumably relies on alloc_mat
       returning zeroed storage for C - confirm against alloc_mat. */
    C = alloc_mat(rows_a, cols_b);

    /* Classic triple loop, accumulating directly into C. */
    printf("Perform matrix multiplication...\n");
    for (r = 0; r < rows_a; r++) {
        for (c = 0; c < cols_b; c++) {
            for (t = 0; t < inner; t++) {
                C[r][c] += A[r][t] * B[t][c];
            }
        }
    }

    /* Dump all three matrices for inspection. */
    print_mat(A, rows_a, inner, "A");
    print_mat(B, inner, cols_b, "B");
    print_mat(C, rows_a, cols_b, "C");

    printf ("\nDone.\n");
    return 0;
}
/*
 * Allocates a matrix through alloc_mat() and fills its SIZE x SIZE entries,
 * stored row after row in one flat array, with rand() / RAND_MAX values.
 *
 * Returns the pointer obtained from alloc_mat().
 */
double* init_mat(){
    double *mat = alloc_mat();
    const double scale = (double)RAND_MAX;  /* divisor mapping rand() to [0, 1] */
    int row, col;

    for (row = 0; row < SIZE; row++) {
        for (col = 0; col < SIZE; col++) {
            mat[row*SIZE + col] = (double)rand() / scale;
        }
    }

    return mat;
}
/** **/ int main (int argc, char* argv[]) { int WORK_DIM = 2; // Wie viele Dimensionen hat der Indexraum? std::chrono::time_point<std::chrono::system_clock> s_start, s_end, p_start, p_end; // Lese den Kernel dynamisch ein: (uebernommen von Foliensatz 9, Folie 20) FILE *fp; const char *FileName = "matmult.cl"; char *KernelSource; fp = fopen(FileName, "r"); if (!fp) { printf("Can't open kernel source: %s", FileName); exit(1); } KernelSource = (char *)malloc(MAX_SOURCE_SIZE); size_t kernel_s_size = fread(KernelSource, 1, MAX_SOURCE_SIZE, fp); fclose(fp); cl_int err; cl_platform_id* platforms = NULL; char platform_name[1024]; cl_device_id device_id = NULL; cl_uint num_of_platforms = 0, num_of_devices = 0; cl_context context; cl_kernel kernel; cl_command_queue command_queue; cl_program program; err = clGetPlatformIDs(0, NULL, &num_of_platforms); if (err != CL_SUCCESS) { printf("No platforms found. Error: %d\n", err); return 0; } // Liefert Plattformen platforms = (cl_platform_id *)malloc(num_of_platforms); err = clGetPlatformIDs(num_of_platforms, platforms, NULL); if (err != CL_SUCCESS) { printf("No platforms found. Error: %d\n", err); return 0; } else { int nvidia_platform = 0; // Speichert den Rang der letzten NVIDIA-Plattform for (unsigned int i=0; i<num_of_platforms; i++) // Fuer jede Plattform: { clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (err != CL_SUCCESS) { printf("Could not get information about platform. Error: %d\n", err); return 0; } if (strstr(platform_name, "NVIDIA") != NULL) { // Falls die Plattform eine NVIDIA-Plattform ist: Speichere ihren Rang nvidia_platform = i; break; } } // Gibt die ID des Devices der NVIDIA-Plattform zurueck err = clGetDeviceIDs(platforms[nvidia_platform], CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices); if (err != CL_SUCCESS) { printf("Could not get device in platform. 
Error: %d\n", err); return 0; } } // Erschaffe einen OpenCl-context, in dem OpenCl-Datenobjekte verwaltet werden koennen context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (err != CL_SUCCESS) { printf("Unable to create context. Error: %d\n", err); return 0; } // Initialisiere eine Befehlswarteschleife, die Befehle fuer OpenCl-Objekte speichern kann command_queue = clCreateCommandQueue(context, device_id, 0, &err); if (err != CL_SUCCESS) { printf("Unable to create command queue. Error: %d\n", err); return 0; } // Initialisiere ein Programm und spezifiziere, aus welchem Code dieses kompiliert werden soll program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, (const size_t *)& kernel_s_size, &err); if (err != CL_SUCCESS) { printf("Unable to create program. Error: %d\n", err); return 0; } // Kompiliere das Programm zur Laufzeit err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { // Zeige Compilermeldungen an: (uebernommen von Foliensatz 9, Folie 23) char *log; size_t size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); log = (char *)malloc(size+1); if (log) { clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); log[size] = '\0'; printf("%s", log); free(log); } printf("Error building program. Error: %d\n", err); return 0; } // Erschaffe einen Kernel und lade oben kompiliertes Programm ein kernel = clCreateKernel(program, "matmult", &err); if (err != CL_SUCCESS) { printf("Error setting kernel. 
Error: %d\n", err); return 0; } float **A, **B, **C; // Matrizen int dim1, dim2, dim3; // Matrixdimensionen dim1 = D1; // Zeilen von A, Zeilen von C dim2 = D2; // Spalten von A, Zeilen von B dim3 = D3; // Spalten von B, Spalten von C A = alloc_mat(dim1, dim2); B = alloc_mat(dim2, dim3); C = alloc_mat(dim1, dim3); init_mat(A, dim1, dim2); init_mat(B, dim2, dim3); cl_mem in_A, in_B, output; // float data[DATA_SIZE] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; size_t global[1] = {dim1*dim3}; // Dimensionen von C size_t global_two[2] = {dim1, dim3}; in_A = clCreateBuffer (context, CL_MEM_READ_ONLY, sizeof(float)*dim1*dim2, NULL, &err); in_B = clCreateBuffer (context, CL_MEM_READ_ONLY, sizeof(float)*dim2*dim3, NULL, &err); output = clCreateBuffer (context, CL_MEM_WRITE_ONLY, sizeof(float)*dim1*dim3, NULL, &err); clEnqueueWriteBuffer(command_queue, in_A, CL_TRUE, 0, sizeof(float)*dim1*dim2, *A, 0, NULL, NULL); clEnqueueWriteBuffer(command_queue, in_B, CL_TRUE, 0, sizeof(float)*dim2*dim3, *B, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_A); clSetKernelArg(kernel, 1, sizeof(cl_mem), &in_B); clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); // clSetKernelArg(kernel, 3, sizeof(int), &dim2); // clSetKernelArg(kernel, 4, sizeof(int), &dim3); clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL); if (WORK_DIM == 2) { clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_two, NULL, 0, NULL, NULL); } // Zeitmessung fuer parallele Version p_start = std::chrono::system_clock::now(); err = clFinish(command_queue); p_end = std::chrono::system_clock::now(); std::chrono::duration<double> p_duration = p_end - p_start; if (err == CL_INVALID_COMMAND_QUEUE ) { printf("CL_INVALID_COMMAND_QUEUE: %d\n", err); return 0; } clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float)*dim1*dim3, *C, 0, NULL, NULL); // Ueberpruefe, ob serielle Version und parallele gleich sind: float **correct_matrix; correct_matrix = alloc_mat(dim1, dim3); 
s_start = std::chrono::system_clock::now(); // Zeitmessung fuer serielle Version correct_matrix = mult_mat(A, B, dim1, dim2, dim3); s_end = std::chrono::system_clock::now(); std::chrono::duration<double> s_duration = s_end - s_start; is_correct(C, correct_matrix, dim1, dim3); // Numerischer Korrektheitsbeweis print_mat(C, dim1, dim3, "C = "); print_mat(correct_matrix, dim1, dim3, "correct_matrix = "); // printf("Kernel execution time: %f\n", t_end-t_start); clReleaseMemObject(in_A); clReleaseMemObject(in_B); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); err = clReleaseCommandQueue(command_queue); //!! if (err != CL_SUCCESS) { printf("Error releasing command queue: %d\n", err); return 0; } clReleaseContext(context); printf("Dauer der seriellen Version: %.2f Millisekunden\n", s_duration.count() * 1000); printf("Dauer der parallelen Version: %.2f Millisekunden\n", p_duration.count() * 1000); printf("Erhaltenes Speed Up: %.2f \n", p_duration.count() / p_duration.count()); return 0; }
int main(int argc, char** argv) { double serial_time, openCL_time, start_time; cl_int err; cl_platform_id* platforms = NULL; char platform_name[1024]; cl_device_id device_id = NULL; cl_uint num_of_platforms = 0; cl_uint num_of_devices = 0; cl_context context; cl_kernel kernel; cl_command_queue command_queue; cl_program program; cl_mem input1, input2, input3, output; float **A, **B, **C, **serialC; // matrices int d1, d2, d3; // dimensions of matrices /* print user instruction */ if (argc != 4) { printf("Matrix multiplication: C = A x B\n"); printf("Usage: %s <NumRowA> <NumColA> <NumColB>\n", argv[0]); return 0; } /* read user input */ d1 = 1000; // rows of A and C d2 = 1000; // cols of A and rows of B d3 = 1000; // cols of B and C int d[4] = { 0, d1, d2, d3 }; size_t global[1] = { (size_t)d1*d3 }; printf("Matrix sizes C[%d][%d] = A[%d][%d] x B[%d][%d]\n", d1, d3, d1, d2, d2, d3); /* prepare matrices */ A = alloc_mat(d1, d2); init_mat(A, d1, d2); B = alloc_mat(d2, d3); init_mat(B, d2, d3); C = alloc_mat(d1, d3); serialC = alloc_mat(d1, d3); err = clGetPlatformIDs(0, NULL, &num_of_platforms); if (err != CL_SUCCESS) { printf("No platforms found. Error: %d\n", err); return 0; } platforms = (cl_platform_id *)malloc(num_of_platforms); err = clGetPlatformIDs(num_of_platforms, platforms, NULL); if (err != CL_SUCCESS) { printf("No platforms found. Error: %d\n", err); return 0; } else { int nvidia_platform = 0; for (unsigned int i = 0; i<num_of_platforms; i++) { clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (err != CL_SUCCESS) { printf("Could not get information about platform. Error: %d\n", err); return 0; } if (strstr(platform_name, "NVIDIA") != NULL) { nvidia_platform = i; break; } } err = clGetDeviceIDs(platforms[nvidia_platform], CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices); if (err != CL_SUCCESS) { printf("Could not get device in platform. 
Error: %d\n", err); return 0; } } context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (err != CL_SUCCESS) { printf("Unable to create context. Error: %d\n", err); return 0; } command_queue = clCreateCommandQueue(context, device_id, 0, &err); if (err != CL_SUCCESS) { printf("Unable to create command queue. Error: %d\n", err); return 0; } program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if (err != CL_SUCCESS) { printf("Unable to create program. Error: %d\n", err); return 0; } if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) { char *log; size_t size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); // 1. Länge des Logbuches? log = (char *)malloc(size + 1); if (log) { clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); // 2. Hole das Logbuch ab log[size] = '\0'; printf("%s", log); free(log); } return 1; } err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { printf("Error building program. Error: %d\n", err); return 0; } kernel = clCreateKernel(program, "matmult_ocl", &err); if (err != CL_SUCCESS) { printf("Error setting kernel. 
Error: %d\n", err); return 0; } input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, d1*d2*sizeof(float), NULL, &err); input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, d2*d3*sizeof(float), NULL, &err); input3 = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(int), NULL, &err); output = clCreateBuffer(context, CL_MEM_READ_WRITE, d1*d3*sizeof(float), NULL, &err); start_time = omp_get_wtime(); clEnqueueWriteBuffer(command_queue, input1, CL_TRUE, 0, d1*d2*sizeof(float), *A, 0, NULL, NULL); clEnqueueWriteBuffer(command_queue, input2, CL_TRUE, 0, d2*d3*sizeof(float), *B, 0, NULL, NULL); clEnqueueWriteBuffer(command_queue, input3, CL_TRUE, 0, 4 * sizeof(int), d, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1); clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2); clSetKernelArg(kernel, 2, sizeof(cl_mem), &input3); clSetKernelArg(kernel, 3, sizeof(cl_mem), &output); clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL); clFinish(command_queue); clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, d1*d3*sizeof(float), *C, 0, NULL, NULL); // for (unsigned int i = 0; i < (unsigned int) d1*d3; i++) // printf("%f\n", C[0][i]); openCL_time = omp_get_wtime() - start_time; clReleaseMemObject(input1); clReleaseMemObject(input2); clReleaseMemObject(input3); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(command_queue); clReleaseContext(context); printf("Running serial algorithm...\n"); start_time = omp_get_wtime(); serialC = mult_mat(A, B, d1, d2, d3); serial_time = omp_get_wtime() - start_time; printf("Checking results... "); is_correct(C, serialC, d1, d3); printf("Showing stats...\n"); printf(" serial runtime = %f\n", serial_time); printf(" OpenCL runtime = %f\n", openCL_time); printf(" Speedup = %f\n", serial_time / openCL_time); return 0; }