int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // number of elements in each matrix double start_time; // Starting time double run_time; // timing data Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> A(szA); // Host memory for Matrix A std::vector<float> B(szB); // Host memory for Matrix B std::vector<float> C(szC); // Host memory for Matrix C initmat(Mdim, Ndim, Pdim, A, B, C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); float tmp; zero_mat(Ndim, Mdim, C); start_time = wtime(); for (int ii = 0; ii < Ndim; ii++) { for (int jj = 0; jj < Mdim; jj++) { tmp = 0.0f; for (int kk = 0; kk < Pdim; kk++) { /* C(ii,jj) = sum(over kk) A(ii,kk) * B(kk,jj) */ tmp += A[ii*Ndim+kk] * B[kk*Pdim+jj]; } C[ii*Ndim+jj] = tmp; } } run_time = wtime() - start_time; results(Mdim, Ndim, Pdim, C, run_time); return EXIT_SUCCESS; }
int main(void) { int N; // A[N][N], B[N][N], C[N][N] int sz; // number of elements in each matrix float tmp; N = ORDER; sz = N * N; std::vector<float> A(sz); // Matrix A std::vector<float> B(sz); // Matrix B std::vector<float> C(sz); // Matrix C initmat(N, N, N, A, B, C); printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER); zero_mat(N, N, C); util::Timer timer; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { tmp = 0.0f; for (int k = 0; k < N; k++) { tmp += A[i*N+k] * B[k*N+j]; } C[i*N+j] = tmp; } } double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, C, rtime); }
// Runs the fixed sequence of member calls that ends in construction_gradient().
//
// nnn: pass selector. When it is 0, initEnv() is called before Env() and
//      init_temoin() is called after Biomasse(); for any other value both of
//      those calls are skipped. All remaining steps always run, in this order.
void Calculs::compile_gradient(int nnn)
{
    const bool run_one_time_setup = (0 == nnn);

    if (run_one_time_setup)
    {
        initEnv();
    }

    Env();
    Biomasse();

    if (run_one_time_setup)
    {
        init_temoin();
    }

    initmat();

    // Derivative blocks, in the order the later steps are invoked after them.
    construction_dGdU();
    construction_dGdX();
    construction_dFdU();
    construction_dSdX();
    construction_dFdX();

    construction_adjoints();
    construction_gradient();
}
int main(void) { float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // number of elements in each matrix cl_mem d_a, d_b, d_c; // Matrices in device memory double start_time; // Starting time double run_time; // timing data char * kernelsource; // kernel source string cl_int err; // error code returned from OpenCL calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; h_A = (float *)malloc(szA * sizeof(float)); h_B = (float *)malloc(szB * sizeof(float)); h_C = (float *)malloc(szC * sizeof(float)); initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = wtime() - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- // Set up OpenCL context. queue, kernel, etc. 
cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * szA, h_A, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * szB, h_B, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * 
szC, NULL, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_elem.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... 
computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. const size_t global[2] = {Ndim, Mdim}; err = clEnqueueNDRangeKernel( commands, kernel, 2, NULL, global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return 
EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = Ndim; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row_priv.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. 
const size_t global = Ndim; const size_t local = ORDER / 16; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // Clean up! //-------------------------------------------------------------------------------- free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // number of elements in each matrix double start_time; // Starting time double run_time; // timing data Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = wtime() - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, util::loadProgram("../C_elem.cl"), true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
C row per work item //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row.cl"), true); // Create the compute kernel from the program auto crow_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); crow_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row_priv.cl"), true); // Create the compute kernel from the program auto arowpriv_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); cl::NDRange local(ORDER / 16); arowpriv_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
C row per work item, A row pivate, B col local //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_row_priv_bloc.cl"), true); // Create the compute kernel from the program auto browloc_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg>(program, "mmul"); printf("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); cl::NDRange global(Ndim); cl::NDRange local(ORDER / 16); cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * Pdim); browloc_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
A and B in block form in local memory //-------------------------------------------------------------------------------- // Create the compute program from the source buffer program = cl::Program(context, util::loadProgram("../C_block_form.cl"), true); // Create the compute kernel from the program auto block_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg>(program, "mmul"); printf("\n===== OpenCL, A and B in block form in local memory, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); int blocksize = 16; cl::NDRange global(Ndim, Mdim); cl::NDRange local(blocksize, blocksize); cl::LocalSpaceArg localmem1 = cl::Local(sizeof(float) * blocksize * blocksize); cl::LocalSpaceArg localmem2 = cl::Local(sizeof(float) * blocksize * blocksize); block_mmul(cl::EnqueueArgs(queue, global, local), Mdim, Ndim, Pdim, d_a, d_b, d_c, localmem1, localmem2); queue.finish(); run_time = wtime() - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main() { double *a; double *b; double *c; int i = 0, j = 0, k = 0; int *events; // Array of events long long *values; // Array of values events int EventSet = PAPI_NULL; // Handle for a PAPI event set as created by PAPI_create_eventset (3) int retval; // Test fail function int num_event = 0; // Number of events int max_event; // Number of available events int EventCode = 0; // Event code PAPI_event_info_t pset; // PAPI_event_info_t Struct Reference char evname[PAPI_MAX_STR_LEN]; // Symbol event /* Memory asignament to matrixs*/ if((a = (double *)malloc(mrows * ncolumns * sizeof(double))) == NULL) printf("Error malloc matrix a[%d]\n",mrows * ncolumns); if((b = (double *)malloc(ncolumns * pcolumns * sizeof(double))) == NULL) printf("Error malloc matrix b[%d]\n",mrows * ncolumns); if((c = (double *)malloc(mrows * pcolumns * sizeof(double))) == NULL) printf("Error malloc matrix c[%d]\n",mrows * ncolumns); /* Initialize the Matrix arrays */ initmat(a, b, mrows, ncolumns, pcolumns); /* Initialize the PAPI library */ retval = PAPI_library_init(PAPI_VER_CURRENT); if (retval != PAPI_VER_CURRENT) test_fail( __FILE__, __LINE__, "PAPI_library_init", retval ); /* Enable and initialize multiplex support */ retval = PAPI_multiplex_init(); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_multiplex_init", retval ); /* Create an EventSet */ retval = PAPI_create_eventset(&EventSet); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_create_eventset", retval ); /* Assign it to the CPU component */ retval = PAPI_assign_eventset_component(EventSet, 0); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_assign_eventset_component", retval ); /* Convert the EventSet to a multiplexed event set */ retval = PAPI_set_multiplex(EventSet); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_set_multiplex", retval ); /* Obtaining the number of available events */ max_event = PAPI_get_opt( PAPI_MAX_MPX_CTRS, NULL ); printf("\nNumber of available 
events: %d", max_event ); /* Fill up the event set with as many non-derived events as we can */ EventCode = PAPI_PRESET_MASK; do { if ( PAPI_get_event_info( EventCode, &pset ) == PAPI_OK ) { if ( pset.count && ( strcmp( pset.derived, "NOT_DERIVED" ) == 0 ) ) { retval = PAPI_add_event( EventSet, ( int ) pset.event_code ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_add_event", retval ); else { //printf( "Added %s\n", pset.symbol ); num_event++; } } } } while ( ( PAPI_enum_event( &EventCode, PAPI_PRESET_ENUM_AVAIL ) == PAPI_OK ) && ( num_event < max_event ) ); /* Memory asignament to values and events*/ events = ( int * ) malloc( ( size_t ) num_event * sizeof ( int ) ); if ( events == NULL ) test_fail( __FILE__, __LINE__, "Error malloc events", 0 ); values = ( long long * ) malloc( ( size_t ) num_event * sizeof ( long long ) ); if ( values == NULL ) test_fail( __FILE__, __LINE__, "Erro malloc values", 0 ); /* Start counting events */ if ((retval=PAPI_start(EventSet)) != PAPI_OK) test_fail(__FILE__, __LINE__, "PAPI_start", retval); /* Matrix-Matrix multiply */ matmul(a, b, c, mrows, ncolumns, pcolumns); /* Read the counters */ if ((retval=PAPI_read( EventSet, values )) != PAPI_OK) test_fail(__FILE__, __LINE__, "PAPI_read_counters", retval); /* Stop counting events */ if ((retval=PAPI_stop( EventSet, values )) != PAPI_OK) test_fail(__FILE__, __LINE__, "PAPI_stop_counters", retval); /* List the events in the event set */ retval = PAPI_list_events( EventSet, events, &num_event ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_list_events", retval ); /* Print results */ printf("\nNumber of non-zero events: %d\n", num_event ); printf( "\nCounts of non-zero available events........................................................\n" ); printf("Name: \t\t\t Value: \t Description:\n"); for ( i = 0; i < num_event; i++ ) { PAPI_event_code_to_name( events[i], evname ); // Obtaining name of available events PAPI_get_event_info(events[i], &pset); if 
( values[i] != 0 ) printf("%s \t %15lld \t %s\n", evname, values[i], pset.long_descr); } printf( "\nCounts of zero available events............................................................\n" ); printf("Name: \t\t\t Value: \t Description:\n"); for ( i = 0; i < num_event; i++ ) { PAPI_event_code_to_name( events[i], evname ); // Obtaining name of available events PAPI_get_event_info(events[i], &pset); if ( values[i] == 0 ) printf("%s \t %15lld \t %s\n", evname, values[i], pset.long_descr); } /* Check if counter pair(s) had identical values */ for ( i = 0; i < num_event; i++ ) { for ( i = j+1; j < num_event; j++ ) { if ( ( i != j ) && ( values[i] == values[j] ) ) k++; } } if ( k != 0 ) { printf( "\nCaution: %d counter pair(s) had identical values\n", k ); } printf("\n"); /* Free memory */ free( events ); free( values ); free( a ); free( b ); free( c ); /* Cleaning events */ retval = PAPI_cleanup_eventset( EventSet ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_cleanup_eventset", retval ); /* Destroying events */ retval = PAPI_destroy_eventset( &EventSet ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_destroy_eventset", retval ); return 0; }
int main(int argc, char *argv[]) { float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix int N; // A[N][N], B[N][N], C[N][N] int size; // number of elements in each matrix cl_mem d_a, d_b, d_c; // Matrices in device memory double start_time; // Starting time double run_time; // timing data char * kernelsource; // kernel source string cl_int err; // error code returned from OpenCL calls cl_device_id device; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel N = ORDER; size = N * N; h_A = (float *)malloc(size * sizeof(float)); h_B = (float *)malloc(size * sizeof(float)); h_C = (float *)malloc(size * sizeof(float)); //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices cl_device_id devices[MAX_DEVICES]; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { printf("Invalid device index (try '--list')\n"); return EXIT_FAILURE; } device = devices[deviceIndex]; char name[MAX_INFO_STRING]; getDeviceName(device, name); printf("\nUsing OpenCL device: %s\n", name); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device, 0, &err); checkError(err, "Creating command queue"); //-------------------------------------------------------------------------------- // Run sequential version on the host //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU 
======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = wtime(); seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = wtime() - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_A, &err); checkError(err, "Creating buffer d_a"); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_B, &err); checkError(err, "Creating buffer d_b"); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, &err); checkError(err, "Creating buffer d_c"); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
Naive //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_elem.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_elem.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_elem.cl"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. 
const size_t global[2] = {N, N}; err = clEnqueueNDRangeKernel( commands, kernel, 2, NULL, global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Copying back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_row.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row.cl"); printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. 
const size_t global = N; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row_priv.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program from C_row_priv.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row_priv.cl"); printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... 
computing // a dot product for each element of the product matrix. const size_t global = N; const size_t local = ORDER / 16; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // Clean up! //-------------------------------------------------------------------------------- free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(void) { int N; // A[N][N], B[N][N], C[N][N] int sz; // number of elements in each matrix float tmp; N = ORDER; sz = N * N; std::vector<float> h_A(sz); // Matrix A on the host std::vector<float> h_B(sz); // Matrix B on the host std::vector<float> h_C(sz); // Matrix C on the host cl::Buffer d_A; // matrix A on the device cl::Buffer d_B; // matrix B on the device cl::Buffer d_C; // matrix C on the device initmat(N, N, N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on CPU ======\n",ORDER); zero_mat(N, N, h_C); util::Timer timer; for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { tmp = 0.0f; for (int k = 0; k < N; k++) { tmp += h_A[i*N+k] * h_B[k*N+j]; } h_C[i*N+j] = tmp; } } double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); printf("\n===== Parallel matrix mult (dot prod), order %d on device ======\n",ORDER); switch (DEVICE) { case CL_DEVICE_TYPE_DEFAULT: printf("DEVICE=DEFAULT\n"); break; case CL_DEVICE_TYPE_CPU: printf("DEVICE=CPU\n"); break; case CL_DEVICE_TYPE_GPU: printf("DEVICE=GPU\n"); break; default: printf("DEVICE=%d\n", DEVICE); break; } zero_mat(N, N, h_C); try { cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context. 
// Build program explicitly so I can catch errors and display // compiler error messages (should any be generated) cl::Program program(context, util::loadProgram("matmul_kernel.cl")); try { program.build(); } catch (cl::Error error) { // If it was a build error then show the error if (error.err() == CL_BUILD_PROGRAM_FAILURE) { std::vector<cl::Device> devices; devices = context.getInfo<CL_CONTEXT_DEVICES>(); std::string built = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]); std::cerr << built << "\n"; } throw error; } // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto mmul = cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer, cl::LocalSpaceArg, cl::LocalSpaceArg> (program, "mmul"); util::Timer timer; d_A = cl::Buffer(context, begin(h_A), end(h_A), true); d_B = cl::Buffer(context, begin(h_B), end(h_B), true); d_C = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * sz); // Work-group computes a block of C. This size is also set // in a #define inside the kernel function. Note this blocksize // must evenly divide the matrix order int blocksize = 16; cl::LocalSpaceArg A_block = cl::Local(sizeof(float) * blocksize*blocksize); cl::LocalSpaceArg B_block = cl::Local(sizeof(float) * blocksize*blocksize); mmul( cl::EnqueueArgs( queue, cl::NDRange(N,N), cl::NDRange(blocksize,blocksize)), N, d_A, d_B, d_C, A_block, B_block); cl::copy(queue, d_C, begin(h_C), end(h_C)); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; results(N, N, N, h_C, rtime); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << std::endl; } }
int main(int argc, char *argv[]) { int N; // A[N][N], B[N][N], C[N][N] int size; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing N = ORDER; size = N * N; std::vector<float> h_A(size); // Host memory for Matrix A std::vector<float> h_B(size); // Host memory for Matrix B std::vector<float> h_C(size); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory //-------------------------------------------------------------------------------- // Create a context and queue //-------------------------------------------------------------------------------- try { cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices std::vector<cl::Device> devices; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { std::cout << "Invalid device index (try '--list')\n"; return EXIT_FAILURE; } cl::Device device = devices[deviceIndex]; std::string name; getDeviceName(device, name); std::cout << "\nUsing OpenCL device: " << name << "\n"; std::vector<cl::Device> chosen_device; chosen_device.push_back(device); cl::Context context(chosen_device); cl::CommandQueue queue(context, device); //-------------------------------------------------------------------------------- // Run sequential matmul //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write 
them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true); d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(N, N); naive_mmul(cl::EnqueueArgs(queue, global), N, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, h_C.begin(), h_C.end()); results(N, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }