// Returns the numeric code that err_code() reports for `err`. A negative
// code is replaced by fex_err_generic, so callers never see a value below
// zero from err_code().
BLARGG_EXPORT int fex_err_code( fex_err_t err )
{
	int const raw = err_code( err );
	if ( raw < 0 )
		return fex_err_generic;
	return raw;
}
int main(int argc, char** argv) { if (argc != 2) { std::cout << "Usage: ./pi_vocl num\n" << "\twhere num = 1, 4 or 8\n"; return EXIT_FAILURE; } int vector_size = atoi(argv[1]); // Define some vector size specific constants unsigned int ITERS, WGS; if (vector_size == 1) { ITERS = 262144; WGS = 8; } else if (vector_size == 4) { ITERS = 262144 / 4; WGS = 32; } else if (vector_size == 8) { ITERS = 262144 / 8; WGS = 64; } else { std::cerr << "Invalid vector size\n"; return EXIT_FAILURE; } // Set some default values: // Default number of steps (updated later to device preferable) unsigned int in_nsteps = INSTEPS; // Default number of iterations unsigned int niters = ITERS; unsigned int work_group_size = WGS; try { // Create context, queue and build program cl::Context context(DEVICE); cl::CommandQueue queue(context); cl::Program program(context, util::loadProgram("../pi_vocl.cl"), true); cl::Kernel kernel; // Now that we know the size of the work_groups, we can set the number of work // groups, the actual number of steps, and the step size unsigned int nwork_groups = in_nsteps/(work_group_size*niters); // Get the max work group size for the kernel pi on our device unsigned int max_size; std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); if (vector_size == 1) { kernel = cl::Kernel(program, "pi"); max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]); } else if (vector_size == 4) { kernel = cl::Kernel(program, "pi_vec4"); max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]); } else if (vector_size == 8) { kernel = cl::Kernel(program, "pi_vec8"); max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]); } if (max_size > work_group_size) { work_group_size = max_size; nwork_groups = in_nsteps/(nwork_groups*niters); } if (nwork_groups < 1) { nwork_groups = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); work_group_size = in_nsteps/(nwork_groups*niters); } unsigned int nsteps = work_group_size * 
niters * nwork_groups; float step_size = 1.0f / (float) nsteps; // Vector to hold partial sum std::vector<float> h_psum(nwork_groups); std::cout << nwork_groups << " work groups of size " << work_group_size << ".\n" << nsteps << " Integration steps\n"; cl::Buffer d_partial_sums(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups); // Start the timer util::Timer timer; // Execute the kernel over the entire range of our 1d input data et // using the maximum number of work group items for this device cl::NDRange global(nwork_groups * work_group_size); cl::NDRange local(work_group_size); kernel.setArg(0, niters); kernel.setArg(1, step_size); cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * work_group_size); kernel.setArg(2, localmem); kernel.setArg(3, d_partial_sums); queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local); cl::copy(queue, d_partial_sums, h_psum.begin(), h_psum.end()); // Complete the sum and compute the final integral value float pi_res = 0.0; for (std::vector<float>::iterator x = h_psum.begin(); x != h_psum.end(); x++) pi_res += *x; pi_res *= step_size; // Stop the timer double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.; std::cout << "The calculation ran in " << rtime << " seconds\n" << " pi = " << pi_res << " for " << nsteps << " steps\n"; return EXIT_SUCCESS; } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; return EXIT_FAILURE; } }
/*
 * Returns a new instance of an LDM proxy. Can take a while because it
 * establishes a connection to the LDM. An LDM-6 connection is attempted
 * first; if that fails with LDM_CLNT_BAD_VERSION, LDM-5 is tried.
 *
 * Arguments:
 *      host            Identifier of the host on which an LDM server is
 *                      running.
 *      instance        Pointer to a pointer to the new instance. "*instance"
 *                      is set upon successful return.
 * Returns:
 *      0               Success. "*instance" is set.
 *      LP_SYSTEM       System error. "log_start()" called.
 *      LP_TIMEDOUT     Connection attempt timed-out. "log_start()" called.
 *      LP_HOSTUNREACH  Host is unreachable. "log_start()" called.
 *      LP_RPC_ERROR    RPC error. "log_start()" called.
 *      LP_LDM_ERROR    LDM error. "log_start()" called.
 */
LdmProxyStatus
lp_new(
    const char* const   host,
    LdmProxy** const    instance)
{
    LdmProxyStatus      status = 0; /* success */
    size_t              nbytes = sizeof(LdmProxy);
    LdmProxy*           proxy = (LdmProxy*)malloc(nbytes);

    if (NULL == proxy) {
        /* "%lu" needs an unsigned long; size_t may be a different type. */
        log_serror("Couldn't allocate %lu bytes for new LdmProxy",
                (unsigned long)nbytes);
        status = LP_SYSTEM;
    }
    else {
        proxy->host = strdup(host);

        if (NULL == proxy->host) {
            LOG_SERROR1("Couldn't duplicate string \"%s\"", host);
            status = LP_SYSTEM;
        }
        else {
            CLIENT*         clnt = NULL;
            ErrorObj*       error = ldm_clnttcp_create_vers(host, LDM_PORT, 6,
                    &clnt, NULL, NULL);

            if (!error) {
                proxy->version = 6;
                proxy->hiya = my_hiya_6;
                proxy->send = my_send_6;
                proxy->flush = my_flush_6;
            }
            else if (LDM_CLNT_BAD_VERSION == err_code(error)) {
                /* Couldn't connect due to protocol version. */
                err_free(error);

                error = ldm_clnttcp_create_vers(host, LDM_PORT, 5,
                        &clnt, NULL, NULL);

                if (!error) {
                    proxy->version = 5;
                    proxy->hiya = my_hiya_5;
                    proxy->send = my_send_5;
                    proxy->flush = my_flush_5;
                }
            }

            if (error) {
                LOG_START1("%s", err_message(error));
                /*
                 * convertStatus() is handed the error object, so it must run
                 * before err_free() releases it (previously it ran after).
                 */
                status = convertStatus(error);
                err_free(error);
                free(proxy->host);
            }
            else {
                proxy->clnt = clnt;
                proxy->rpcTimeout = rpcTimeout;
            }
        }                                       /* "proxy->host" allocated */

        if (LP_OK == status) {
            *instance = proxy;
        }
        else {
            free(proxy);
        }
    }                                           /* "proxy" allocated */

    return status;
}
int main(void) { std::vector<float> h_a(LENGTH); // a vector std::vector<float> h_b(LENGTH); // b vector std::vector<float> h_c(LENGTH, 0xdeadbeef); // c = a + b, from compute device cl::Buffer d_a; // device memory used for the input a vector cl::Buffer d_b; // device memory used for the input b vector cl::Buffer d_c; // device memory used for the output c vector // Fill vectors a and b with random float values int count = LENGTH; for(int i = 0; i < count; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; } try { // Create a context cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context cl::Program program(context, util::loadProgram("vadd.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int> vadd(program, "vadd"); d_a = cl::Buffer(context, h_a.begin(), h_a.end(), true); d_b = cl::Buffer(context, h_b.begin(), h_b.end(), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); util::Timer timer; vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_a, d_b, d_c, count); queue.finish(); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; printf("\nThe kernels ran in %lf seconds\n", rtime); cl::copy(queue, d_c, h_c.begin(), h_c.end()); // Test the results int correct = 0; float tmp; for(int i = 0; i < count; i++) { tmp = h_a[i] + h_b[i]; // expected value for d_c[i] tmp -= h_c[i]; // compute errors if(tmp*tmp < TOL*TOL) { // correct if square deviation is less correct++; // than tolerance squared } else { printf( " tmp %f h_a %f h_b %f h_c %f \n", tmp, h_a[i], h_b[i], h_c[i]); } } // summarize results printf( "vector add to find C = A+B: %d out of %d results were correct.\n", correct, count); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } }
int main(int argc, char** argv) { cl_int err; // error code returned from OpenCL calls size_t dataSize = sizeof(float) * LENGTH; float* h_a = (float *)malloc(dataSize); // a vector float* h_b = (float *)malloc(dataSize); // b vector float* h_c = (float *)malloc(dataSize); // c vector (result) float* h_d = (float *)malloc(dataSize); // d vector (result) float* h_e = (float *)malloc(dataSize); // e vector float* h_f = (float *)malloc(dataSize); // f vector (result) float* h_g = (float *)malloc(dataSize); // g vector unsigned int correct; // number of correct results size_t global; // global domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_vadd; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory used for the input b vector cl_mem d_c; // device memory used for the output c vector cl_mem d_d; // device memory used for the output d vector cl_mem d_e; // device memory used for the input e vector cl_mem d_f; // device memory used for the output f vector cl_mem d_g; // device memory used for the input g vector // Fill vectors a and b with random float values int i = 0; for(i = 0; i < LENGTH; i++){ h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_e[i] = rand() / (float)RAND_MAX; h_g[i] = rand() / (float)RAND_MAX; } // Set up platform and GPU device cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); checkError(err, "Finding platforms"); if (numPlatforms == 0) { printf("Found 0 platforms!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); checkError(err, "Getting platforms"); // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) { 
break; } } if (device_id == NULL) checkError(err, "Getting device"); err = output_device_info(device_id); checkError(err, "Outputting device info"); // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); checkError(err, "Creating command queue"); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program ko_vadd = clCreateKernel(program, "vadd", &err); checkError(err, "Creating kernel"); // Create the input (a, b, e, g) arrays in device memory // NB: we copy the host pointers here too d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_a, &err); checkError(err, "Creating buffer d_a"); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_b, &err); checkError(err, "Creating buffer d_b"); d_e = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_e, &err); checkError(err, "Creating buffer d_e"); d_g = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, dataSize, h_g, &err); checkError(err, "Creating buffer d_g"); // Create the output arrays in device memory d_c = clCreateBuffer(context, CL_MEM_READ_WRITE, dataSize, NULL, &err); checkError(err, "Creating buffer d_c"); d_d = clCreateBuffer(context, CL_MEM_READ_WRITE, dataSize, NULL, &err); checkError(err, "Creating buffer d_d"); d_f = 
clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err); checkError(err, "Creating buffer d_f"); const int count = LENGTH; // Enqueue kernel - first time // Set the arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &count); checkError(err, "Setting kernel arguments"); // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = count; err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel 1st time"); // Enqueue kernel - second time // Set different arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_e); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_d); checkError(err, "Setting kernel arguments"); // Enqueue the kernel again err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel 2nd time"); // Enqueue kernel - third time // Set different (again) arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_g); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_d); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_f); checkError(err, "Setting kernel arguments"); // Enqueue the kernel again err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel 3rd time"); // Read back the result from the compute device err = clEnqueueReadBuffer( commands, d_f, CL_TRUE, 0, sizeof(float) * count, h_f, 0, NULL, NULL ); checkError(err, "Reading back d_f"); // Test the results correct = 0; float tmp; for(i = 0; i < count; i++) { tmp = h_a[i] + h_b[i] + h_e[i] + h_g[i]; // assign 
element i of a+b+e+g to tmp tmp -= h_f[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else { printf(" tmp %f h_a %f h_b %f h_e %f h_g %f h_f %f\n",tmp, h_a[i], h_b[i], h_e[i], h_g[i], h_f[i]); } } // summarize results printf("C = A+B+E+G: %d out of %d results were correct.\n", correct, count); // cleanup then shutdown clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseMemObject(d_d); clReleaseMemObject(d_e); clReleaseMemObject(d_f); clReleaseMemObject(d_g); clReleaseProgram(program); clReleaseKernel(ko_vadd); clReleaseCommandQueue(commands); clReleaseContext(context); free(h_a); free(h_b); free(h_c); free(h_d); free(h_e); free(h_f); free(h_g); return 0; }
/*
 * Combines the first two data bytes of the packet into one value, with
 * data[0] shifted into bits 8..15 and data[1] in the low bits, and returns
 * whatever err_code() yields for that value.
 */
static int err_code_pkt(DUSBVirtualPacket *pkt)
{
	const uint16_t high = (uint16_t)pkt->data[0];

	return err_code((high << 8) | pkt->data[1]);
}
/*
 * Attempts to connect to an upstream LDM using the given LDM version. The
 * given port is tried first. If that attempt fails with
 * LDM_CLNT_NO_CONNECT, the failure is logged and the connection is retried
 * via the portmapper (port 0). Any other failure, including failure to
 * resolve the host, is returned immediately.
 *
 * The client is responsible for freeing the client resources set by this
 * function on success. Calls exitIfDone() before each potentially lengthy
 * operation.
 *
 * Arguments:
 *      upName                 The name of the upstream LDM host.
 *      port                   The port on which to connect.
 *      version                Program version.
 *      *client                Pointer to CLIENT structure. Set on success.
 *      *socket                The socket used for the connection. Set on
 *                             success. May be NULL.
 *      *upAddr                The IP address of the upstream LDM host. Set on
 *                             success. May be NULL.
 * Returns:
 *      NULL                   Success. *client is set; *socket and *upAddr
 *                             are set if the corresponding pointer is
 *                             non-NULL.
 *      !NULL                  Error. "*client" is not set.
 *                             err_code(RETURN_VALUE):
 *          LDM_CLNT_UNKNOWN_HOST         Unknown upstream host.
 *          LDM_CLNT_TIMED_OUT            Call to upstream host timed-out.
 *          LDM_CLNT_BAD_VERSION          Upstream LDM isn't given version.
 *          LDM_CLNT_NO_CONNECT           Other connection-related error.
 *          LDM_CLNT_SYSTEM_ERROR         A fatal system-error occurred.
 */
ErrorObj*
ldm_clnttcp_create_vers(
    const char* const            upName,
    const unsigned               port,
    unsigned const               version,
    CLIENT** const               client,
    int* const                   socket,
    struct sockaddr_in*          upAddr)
{
    ErrorObj*           error;
    struct sockaddr_in  addr;

    assert(upName != NULL);
    assert(client != NULL);

    /*
     * Get the IP address of the upstream LDM.  This is a potentially
     * lengthy operation.
     */
    (void)exitIfDone(0);
    error = ldm_clnt_addr(upName, &addr);

    if (error) {
        error = ERR_NEW1(LDM_CLNT_UNKNOWN_HOST, error,
            "Couldn't get IP address of host %s", upName);
    }
    else {
        int                     sock;
        int                     errCode;
        CLIENT*                 clnt = NULL;

        /*
         * Connect to the remote port.  This is a potentially lengthy
         * operation.
         */
        (void)exitIfDone(0);
        error = ldm_clnt_tcp_create(&addr, version, port, &clnt, &sock);

        if (error) {
            errCode = err_code(error);

            if (LDM_CLNT_NO_CONNECT != errCode) {
                error =
                    ERR_NEW3(errCode, error,
                        "Couldn't connect to LDM %d on %s "
                            "using port %d",
                        version, upName, port);
            }
            else {
                err_log_and_free(
                    ERR_NEW3(0, error,
                        "Couldn't connect to LDM %d on %s using port "
                            "%d",
                        version, upName, port),
                    ERR_INFO);

                /*
                 * Connect using the portmapper.  This is a
                 * potentially lengthy operation.
                 */
                (void)exitIfDone(0);
                error = ldm_clnt_tcp_create(&addr, version, 0, &clnt, &sock);

                if (error) {
                    error =
                        ERR_NEW2(err_code(error), error,
                            "Couldn't connect to LDM on %s "
                            "using either port %d or portmapper",
                            upName, port);
                }                       /* portmapper failure */
            }                           /* non-fatal port failure */
        }                               /* port failure */

        /*
         * Tested independently of the branch above so that a successful
         * portmapper retry also sets the return arguments. Previously this
         * was the "else" of the first attempt, so that path returned NULL
         * with "*client" left unset.
         */
        if (!error) {
            /*
             * Success.  Set the return arguments.
             */
            *client = clnt;

            if (socket)
                *socket = sock;
            if (upAddr)
                *upAddr = addr;
        }                               /* connected */
    }                                   /* got upstream IP address */

    return error;
}
void TransformExport::Export() { ModelExporter & modelExporter = ModelExporter::GetExporter(); modelExporter.ResetTimeline(); MStatus stat; MItDependencyNodes itDep(MFn::kTransform,&stat); while (!itDep.isDone()) { MObject obj = itDep.item(); MFnTransform transform(obj, &stat); err_code(stat); MString cmd = MString("reference -q -f ") + transform.name(); MString file_id; stat = MGlobal::executeCommand( cmd, file_id ); if( stat == MS::kSuccess ) { itDep.next(); continue; } MString transformName = transform.name(&stat); err_code(stat); unsigned int parentCount = transform.parentCount(&stat); err_code(stat); unsigned int childCount = transform.childCount(&stat); err_code(stat); bool doExport = false; if (childCount == 0) { doExport = true; // Tip JOINTS } for (unsigned int child = 0 ; child < childCount ; child++) { MObject childObj = transform.child(child,&stat); err_code(stat); MFn::Type childType = childObj.apiType(); if ( modelExporter.CheckChildType(childType) ) { doExport = true; break; } } if (!doExport) { itDep.next(); continue; } TransformData * pTransformData = new TransformData(); pTransformData->name = transformName.asChar(); MObject parentObj = transform.parent(0,&stat); MFn::Type parentType = parentObj.apiType(); if ( modelExporter.CheckParentType(parentType) ) { MFnDagNode parentDagNode(parentObj, &stat); MString parentName = parentDagNode.name(&stat); if (parentName.length() > 0) pTransformData->parentName = parentName.asChar(); } if (parentType != MFn::kJoint && obj.apiType() == MFn::kJoint) { modelExporter.mSkeletonRoot = transform.name().asChar(); } MVector translate = transform.getTranslation(MSpace::kTransform, &stat); err_code(stat); MVector pivot = transform.rotatePivotTranslation(MSpace::kTransform, &stat); err_code(stat); pTransformData->tx = (float)translate.x; pTransformData->ty = (float)translate.y; pTransformData->tz = (float)translate.z; pTransformData->px = (float)pivot.x; pTransformData->py = (float)pivot.y; pTransformData->pz = 
(float)pivot.z; //double rx,ry,rz,rw; MQuaternion quat; stat = transform.getRotation(quat); err_code(stat); if (transform.object().hasFn(MFn::kJoint)) { err_code(stat); MFnIkJoint joint(transform.object(), &stat); err_code(stat); MQuaternion RO; MQuaternion R; MQuaternion JO; MQuaternion IS; // We dont have time for this. stat = joint.getScaleOrientation(RO); err_code(stat); stat = joint.getRotation(R); err_code(stat); stat = joint.getOrientation(JO); err_code(stat); quat = RO*R*JO; } //stat = transform.getRotationQuaternion(rx,ry,rz,rw, MSpace::kTransform); //err_code(stat); pTransformData->rx = -(float)quat.x; pTransformData->ry = -(float)quat.y; pTransformData->rz = -(float)quat.z; pTransformData->rw = (float)quat.w; double scale[3]; stat = transform.getScale(scale); err_code(stat); pTransformData->sx = (float)scale[0]; pTransformData->sy = (float)scale[1]; pTransformData->sz = (float)scale[2]; pTransformData->index = modelExporter.mTransformCount; modelExporter.mSceneTransforms.push_back(pTransformData); modelExporter.mSceneTransformsTable[transformName.asChar()] = modelExporter.mTransformCount; modelExporter.mTransformCount++; stat = itDep.next(); err_code(stat); } vector<TransformData*>::iterator nodesIter = modelExporter.mSceneTransforms.begin(); while (nodesIter != modelExporter.mSceneTransforms.end()) { TransformData * pTransformData = *nodesIter; if (pTransformData->parentName.length() > 0) pTransformData->parent = modelExporter.mSceneTransformsTable[pTransformData->parentName]; nodesIter++; } WriteTransforms(); }
int main(void) { std::vector<float> h_a(count); // a vector std::vector<float> h_b(count); // b vector std::vector<float> h_c(count, 0xdeadbeef); // c = a + b cl::Buffer d_a; // device memory used for the input a vector cl::Buffer d_b; // device memory used for the input b vector cl::Buffer d_c; // device memory used for the output c vector // Fill vectors a and b with random float values for (size_t i = 0; i < count; i++) { h_a[i] = rand_r(&seed) / static_cast<float>(UINT32_MAX); h_b[i] = rand_r(&seed) / static_cast<float>(UINT32_MAX); } try { // Create a context cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context cl::Program program(context, util::loadProgram("vadd.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto vadd = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int>(program, "vadd"); d_a = cl::Buffer(context, begin(h_a), end(h_a), true); d_b = cl::Buffer(context, begin(h_b), end(h_b), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count); util::Timer timer; vadd(cl::EnqueueArgs(queue, cl::NDRange(count)), d_a, d_b, d_c, static_cast<int>(count)); queue.finish(); double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; printf("\nThe kernels ran in %lf seconds\n", rtime); cl::copy(queue, d_c, begin(h_c), end(h_c)); // NOLINT // Test the results int correct = 0; float tmp; for (size_t i = 0; i < count; i++) { tmp = h_a[i] + h_b[i]; // expected value for d_c[i] tmp -= h_c[i]; // compute errors if (tmp * tmp < tolerance * tolerance) correct++; else printf(" tmp %f h_a %f h_b %f h_c %f \n", tmp, h_a[i], h_b[i], h_c[i]); } // summarize results printf("vector add to find C = A+B: %d out of %zu results were correct.\n", correct, count); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")\n"; } }
int main(void) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device preferable) int niters = ITERS; // number of iterations int nsteps; float step_size; size_t nwork_groups; size_t max_size, work_group_size = 8; float pi_res; cl_mem d_partial_sums; char *kernelsource = getKernelSource("../pi_ocl.cl"); // Kernel source cl_int err; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel_pi; // compute kernel // Set up OpenCL context. queue, kernel, etc. cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Output information err = output_device_info(device_id); // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char 
**) & kernelsource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel_pi = clCreateKernel(program, "pi", &err); if (!kernel_pi || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Find kernel work-group size err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err)); return EXIT_FAILURE; } // Now that we know the size of the work-groups, we can set the number of // work-groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if (nwork_groups < 1) { err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL); work_group_size = in_nsteps / (nwork_groups * niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/(float)nsteps; h_psum = calloc(sizeof(float), nwork_groups); if (!h_psum) { printf("Error: could not allocate host memory for h_psum\n"); return EXIT_FAILURE; } printf(" %ld work-groups of size %ld. 
%d Integration steps\n", nwork_groups, work_group_size, nsteps); d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); if (err != CL_SUCCESS) { printf("Error: Failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // Set kernel arguments err = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments!\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1D input data set // using the maximum number of work items for this device size_t global = nwork_groups * work_group_size; size_t local = work_group_size; double rtime = wtime(); err = clEnqueueNDRangeKernel( commands, kernel_pi, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clEnqueueReadBuffer( commands, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // complete the sum and compute the final integral value on the host pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); // clean up clReleaseMemObject(d_partial_sums); clReleaseProgram(program); clReleaseKernel(kernel_pi); clReleaseCommandQueue(commands); clReleaseContext(context); free(kernelsource); free(h_psum); }
int main(int argc, char *argv[]) { // Declare variables - y', L // Load from file? Declare within host? Type y[K]; Type L[K*K]; Type R[K]; Type m[K]; Complex Xml[K]; int check_result; cl_mem input_y; cl_mem input_L; cl_mem output_xml; // OpenCL-specific variables cl_device_id device_id; cl_platform_id platform_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel SDkernel; cl_int dev_type; cl_int error; cl_event event; FILE *kernel; char *kernelSRC; size_t global[2]; size_t local[2]; if(argc > 1) { kernel = argv[1]; } else printf("\nError - must specify arguments\n"); //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &error); if (!context) { printf("Error: Failed to create a compute context!\n%s\n", err_code(error)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &error); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(error)); return EXIT_FAILURE; } // Create buffers 
for each argument of kernel // Need to add for R^2 and m input_y = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Type) * K, y, &err); input_L = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(Type) * K * K, L, &err); output_xml = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(Complex) * K, Xml, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSRC, NULL, &error); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "SD", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, input_y, CL_TRUE, 0, sizeof(Type)*K, y, 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, input_L, CL_TRUE, 0, sizeof(Type)*K*K, L, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Error: could not write buffer\nError code %d\n", err); return EXIT_FAILURE; } err = clSetKernelArg(SDkernel, 0, sizeof(cl_mem), &input_y); err |= clSetKernelArg(SDkernel, 1, sizeof(cl_mem), &input_L); err |= clSetKernelArg(SDkernel, 2, sizeof(cl_mem), &input_R); err |= clSetKernelArg(SDkernel, 3, sizeof(cl_mem), &output_m); err |= clSetKernelArg(SDkernel, 4, sizeof(cl_mem), &output_xml); if(err != CL_SUCCESS) { printf("Error: could not set kernel arguments\nError code %d\n", err); return EXIT_FAILURE; } global[0] = K; 
global[1] = K; local[0] = K; local[1] = 1; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, &event); if(err != CL_SUCCESS) { printf("Error: could not set ND range\nError code %d\n", err); return EXIT_FAILURE; } clEnqueueReadBuffer(commands, output_m, CL_TRUE, 0, sizeof(Type)*K, m, 0, NULL, NULL); clEnqueueReadBuffer(commands, output_xml, CL_TRUE, 0, sizeof(Type)*K, m, 0, NULL, NULL); clReleaseMemObject(input_y); clReleaseMemObject(input_L); clReleaseMemObject(output_xml); clReleaseProgram(program); clReleaseKernel(SDkernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(void) { std::vector<float> h_a(LENGTH); // a vector std::vector<float> h_b(LENGTH); // b vector std::vector<float> h_c (LENGTH); // c vector std::vector<float> h_r (LENGTH, 0xdeadbeef); // d vector (result) cl::Buffer d_a; // device memory used for the input a vector cl::Buffer d_b; // device memory used for the input b vector cl::Buffer d_c; // device memory used for the input c vector cl::Buffer d_r; // device memory used for the output r vector // Fill vectors a and b with random float values int count = LENGTH; for(int i = 0; i < count; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_c[i] = rand() / (float)RAND_MAX; } try { // Create a context cl::Context context(DEVICE); // Load in kernel source, creating a program object for the context cl::Program program(context, util::loadProgram("vadd_abc.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel functor auto vadd = cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int>(program, "vadd"); d_a = cl::Buffer(context, begin(h_a), end(h_a), true); d_b = cl::Buffer(context, begin(h_b), end(h_b), true); d_c = cl::Buffer(context, begin(h_c), end(h_c), true); d_r = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); vadd( cl::EnqueueArgs( queue, cl::NDRange(count)), d_a, d_b, d_c, d_r, count); cl::copy(queue, d_r, begin(h_r), end(h_r)); // Test the results int correct = 0; float tmp; for(int i = 0; i < count; i++) { tmp = h_a[i] + h_b[i] + h_c[i]; // assign element i of a+b+c to tmp tmp -= h_r[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else { printf(" tmp %f h_a %f h_b %f h_c %f h_r %f \n",tmp, h_a[i], h_b[i], h_c[i], h_r[i]); } } // summarize results printf("R = A+B+C: %d out of %d results were correct.\n", correct, count); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << 
err.what() << "(" << err_code(err.err()) << ")" << std::endl; } }
int main(void) { BOOL_T err = CO_FALSE; /* error flag */ UNSIGNED8 temp = 0; uword time_lo = 0, time_hi = 0; /*--- hardware initialization e.g SIO, Chip-Selects, ...----------*/ iniDevice(); IO_vInit(); USIC0_vInit(); USIC1_vInit(); USIC2_vInit(); #if HW_KPS == 1 USIC3_vInit(); #endif //RTC init DAVE_vUnlockProtecReg(); RTC_vInit(); NOP(); /* one cycle delay */ NOP(); /* one cycle delay */ DAVE_vLockProtecReg(); lNodeId = (~IO_uwReadPort(P2))&0xFF; if (lNodeId > 127) { lNodeId = 127; } temp = (~IO_uwReadPort(P10))&0x0F; switch(temp){ case 0: bitRate = 10; break; case 1: bitRate = 20; break; case 2: bitRate = 50; break; case 3: bitRate = 125; break; case 4: bitRate = 250; break; case 5: bitRate = 500; break; case 0x9: case 0xE: case 0xF: bitRate = 125; if (((~IO_uwReadPort(P2)) & 0xFF) == 0){ lNodeId = 127; } break; default: bitRate = 500; break; } initCan(bitRate); init_Library(); initTimer(); Start_CAN(); ENABLE_CPU_INTERRUPTS(); RTC_vSetTime(0,0); sw_init(); //PRINTF("loop\n"); while (err == CO_FALSE) { FlushMbox(); /* Do the CANopen job */ myRTC_ulGetTime(&time_lo, &time_hi); //odczyt temperatury wewn. 
status_wewn[2] = ad7814_read(U2C0, SPI_CS3); sw_loop(); //sygnalizacja na diodach if (blink(time_lo, ch1_led_st)) { //St1 IO_vResetPin(IO_P0_0_LED_ST1); } else { IO_vSetPin(IO_P0_0_LED_ST1); } if (blink(time_lo, ch2_led_st)) { //St2 IO_vResetPin(IO_P4_1_LED_ST2); } else { IO_vSetPin(IO_P4_1_LED_ST2); } if (blink(time_lo, ch1_led_err)) { //Err1 IO_vResetPin(IO_P4_2_LED_ERR1); } else { IO_vSetPin(IO_P4_2_LED_ERR1); } if (blink(time_lo, ch2_led_err)) { //Err2 IO_vResetPin(IO_P4_0_LED_ERR2); } else { IO_vSetPin(IO_P4_0_LED_ERR2); } //w funkcji err_code dopisac cykliczna informacje o kilku bledach (numer bledu jako bity w slowie "dev_led_st" a nie wartosc) if (err_code(time_lo, dev_led_st)) { //err code - Yellow diode IO_vResetPin(IO_P2_8_LED_C); } else { IO_vSetPin(IO_P2_8_LED_C); } /* give a chance to finish the loop */ err = endLoop(); } PRINTF("\nSTOP\n"); Stop_CAN(); DISABLE_CPU_INTERRUPTS(); releaseTimer(); ResetIntMask(); deinit_Library(); return 0; }
int main(void) { int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; std::vector<float> h_A(szA); // Host memory for Matrix A std::vector<float> h_B(szB); // Host memory for Matrix B std::vector<float> h_C(szC); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } try { //-------------------------------------------------------------------------------- // Create a context and queue for DEVICE //-------------------------------------------------------------------------------- cl::Context context(DEVICE); cl::CommandQueue queue(context); //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = cl::Buffer(context, begin(h_A), end(h_A), true); d_b = cl::Buffer(context, begin(h_B), end(h_B), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program auto naive_mmul = cl::make_kernel<int, int, int, cl::Buffer, cl::Buffer, cl::Buffer>(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(Ndim, Mdim); naive_mmul(cl::EnqueueArgs(queue, global), Mdim, Ndim, Pdim, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, begin(h_C), end(h_C)); results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
// Returns the text describing err. When err_code() yields a non-negative
// code the result comes from blargg_err_details(); otherwise it comes from
// blargg_err_str().
BLARGG_EXPORT const char* fex_err_details( fex_err_t err )
{
	// No error code assigned: hand back the entire string
	if ( err_code( err ) < 0 )
		return blargg_err_str( err );

	return blargg_err_details( err );
}
int main(int argc, char *argv[]) { int N; // A[N][N], B[N][N], C[N][N] int size; // Number of elements in each matrix double start_time; // Starting time double run_time; // Timing util::Timer timer; // Timing N = ORDER; size = N * N; std::vector<float> h_A(size); // Host memory for Matrix A std::vector<float> h_B(size); // Host memory for Matrix B std::vector<float> h_C(size); // Host memory for Matrix C cl::Buffer d_a, d_b, d_c; // Matrices in device memory //-------------------------------------------------------------------------------- // Create a context and queue //-------------------------------------------------------------------------------- try { cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices std::vector<cl::Device> devices; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { std::cout << "Invalid device index (try '--list')\n"; return EXIT_FAILURE; } cl::Device device = devices[deviceIndex]; std::string name; getDeviceName(device, name); std::cout << "\nUsing OpenCL device: " << name << "\n"; std::vector<cl::Device> chosen_device; chosen_device.push_back(device); cl::Context context(chosen_device); cl::CommandQueue queue(context, device); //-------------------------------------------------------------------------------- // Run sequential matmul //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); timer.reset(); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",N); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write 
them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = cl::Buffer(context, h_A.begin(), h_A.end(), true); d_b = cl::Buffer(context, h_B.begin(), h_B.end(), true); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- // Create the compute program from the source buffer cl::Program program(context, kernelsource, true); // Create the compute kernel from the program cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> naive_mmul(program, "mmul"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0; // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. cl::NDRange global(N, N); naive_mmul(cl::EnqueueArgs(queue, global), N, d_a, d_b, d_c); queue.finish(); run_time = static_cast<double>(timer.getTimeMilliseconds()) / 1000.0 - start_time; cl::copy(queue, d_c, h_C.begin(), h_C.end()); results(N, h_C, run_time); } // end for loop } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } return EXIT_SUCCESS; }
int main(void) { try { // Discover number of platforms std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); std::cout << "\nNumber of OpenCL plaforms: " << platforms.size() << std::endl; // Investigate each platform std::cout << "\n-------------------------" << std::endl; for (std::vector<cl::Platform>::iterator plat = platforms.begin(); plat != platforms.end(); plat++) { std::string s; plat->getInfo(CL_PLATFORM_NAME, &s); std::cout << "Platform: " << s << std::endl; plat->getInfo(CL_PLATFORM_VENDOR, &s); std::cout << "\tVendor: " << s << std::endl; plat->getInfo(CL_PLATFORM_VERSION, &s); std::cout << "\tVersion: " << s << std::endl; // Discover number of devices std::vector<cl::Device> devices; plat->getDevices(CL_DEVICE_TYPE_ALL, &devices); std::cout << "\n\tNumber of devices: " << devices.size() << std::endl; // Investigate each device for (std::vector<cl::Device>::iterator dev = devices.begin(); dev != devices.end(); dev++ ) { std::cout << "\t-------------------------" << std::endl; dev->getInfo(CL_DEVICE_NAME, &s); std::cout << "\t\tName: " << s << std::endl; dev->getInfo(CL_DEVICE_OPENCL_C_VERSION, &s); std::cout << "\t\tVersion: " << s << std::endl; int i; dev->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &i); std::cout << "\t\tMax. 
Compute Units: " << i << std::endl; size_t size; dev->getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &size); std::cout << "\t\tLocal Memory Size: " << size/1024 << " KB" << std::endl; dev->getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &size); std::cout << "\t\tGlobal Memory Size: " << size/(1024*1024) << " MB" << std::endl; dev->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); std::cout << "\t\tMax Alloc Size: " << size/(1024*1024) << " MB" << std::endl; dev->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size); std::cout << "\t\tMax Work-group Total Size: " << size << std::endl; std::vector<size_t> d; dev->getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &d); std::cout << "\t\tMax Work-group Dims: ("; for (std::vector<size_t>::iterator st = d.begin(); st != d.end(); st++) std::cout << *st << " "; std::cout << "\x08)" << std::endl; std::cout << "\t-------------------------" << std::endl; } std::cout << "\n-------------------------\n"; } } catch (cl::Error err) { std::cout << "OpenCL Error: " << err.what() << " returned " << err_code(err.err()) << std::endl; std::cout << "Check cl.h for error codes." << std::endl; system("pause"); exit(-1); } system("pause"); return 0; }
int main(int argc, char** argv) { if (argc != 2) { printf("Usage: ./pi_vocl num\n"); printf("\twhere num = 1, 4 or 8\n"); return EXIT_FAILURE; } int vector_size = atoi(argv[1]); // Define some vector size specific constants unsigned int ITERS, WGS; if (vector_size == 1) { ITERS = 262144; WGS = 8; } else if (vector_size == 4) { ITERS = 262144 / 4; WGS = 32; } else if (vector_size == 8) { ITERS = 262144 / 8; WGS = 64; } else { fprintf(stderr, "Invalid vector size\n"); return EXIT_FAILURE; } // Set some default values: // Default number of steps (updated later to device preferable) unsigned int in_nsteps = INSTEPS; // Defaultl number of iterations unsigned int niters = ITERS; unsigned int work_group_size = WGS; // Create context, queue and build program cl_int err; cl_context context; cl_device_id device; cl_command_queue queue; cl_program program; cl_kernel kernel; // Find number of platforms cl_uint numPlatforms; err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); // Get all platforms cl_platform_id platforms[numPlatforms]; err = clGetPlatformIDs(numPlatforms, platforms, NULL); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(platforms[i], DEVICE, 1, &device, NULL); if (err == CL_SUCCESS) break; } if (device == NULL) die(err_code(err), __LINE__, __FILE__); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); // Create a command queue queue = clCreateCommandQueue(context, device, 0, &err); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); // Create the compute program from the source buffer char *kernel_source = getKernelSource("../pi_vocl.cl"); program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); // Build 
the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); die(err_code(err), __LINE__, __FILE__); } if (vector_size == 1) { kernel = clCreateKernel(program, "pi", &err); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); } else if (vector_size == 4) { kernel = clCreateKernel(program, "pi_vec4", &err); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); } else if (vector_size == 8) { kernel = clCreateKernel(program, "pi_vec8", &err); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); } // Now that we know the size of the work_groups, we can set the number of work // groups, the actual number of steps, and the step size unsigned int nwork_groups = in_nsteps/(work_group_size*niters); // Get the max work group size for the kernel pi on our device size_t max_size; err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_size), &max_size, NULL); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); if (max_size > work_group_size) { work_group_size = max_size; nwork_groups = in_nsteps/(nwork_groups*niters); } if (nwork_groups < 1) { err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(nwork_groups), &nwork_groups, NULL); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); work_group_size = in_nsteps/(nwork_groups*niters); } unsigned int nsteps = work_group_size * niters * nwork_groups; float step_size = 1.0f / (float) nsteps; // Array to hold partial sum float *h_psum = (float*)calloc(nwork_groups, sizeof(float)); printf("%d work groups of size %d.\n", nwork_groups, work_group_size); printf(" %u Integration steps\n", nsteps); cl_mem d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); 
// Execute the kernel over the entire range of our 1d input data et // using the maximum number of work group items for this device const size_t global = nwork_groups * work_group_size; const size_t local = work_group_size; err = clSetKernelArg(kernel, 0, sizeof(int), &niters); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); err = clSetKernelArg(kernel, 1, sizeof(float), &step_size); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); err = clSetKernelArg(kernel, 2, sizeof(float) * work_group_size, NULL); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_partial_sums); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); // Start the timer double rtime = wtime(); err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); err = clEnqueueReadBuffer(queue, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); if (err != CL_SUCCESS) die(err_code(err), __LINE__, __FILE__); // complete the sum and compute the final integral value on the host float pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %u steps\n", pi_res, nsteps); free(h_psum); free(kernel_source); }
int dusb_dissect_cmd_data(CalcModel model, FILE *f, const uint8_t * data, uint32_t len, uint32_t vtl_size, uint16_t vtl_type) { int ret = dusb_check_cmd_data(model, data, len, vtl_size, vtl_type); (void)vtl_size; if (ret) { return ret; } switch (vtl_type) { case DUSB_VPKT_PING: { uint16_t arg1 = (((uint16_t)(data[0])) << 8) | ((uint16_t)(data[1])); uint16_t arg2 = (((uint16_t)(data[2])) << 8) | ((uint16_t)(data[3])); uint16_t arg3 = (((uint16_t)(data[4])) << 8) | ((uint16_t)(data[5])); uint16_t arg4 = (((uint16_t)(data[6])) << 8) | ((uint16_t)(data[7])); uint16_t arg5 = (((uint16_t)(data[8])) << 8) | ((uint16_t)(data[9])); fprintf(f, "Set mode: { %u, %u, %u, %u, 0x%04X }\n", arg1, arg2, arg3, arg4, arg5); } break; case DUSB_VPKT_PARM_REQ: { uint16_t npids = (((uint16_t)(data[0])) << 8) | ((uint16_t)(data[1])); uint16_t i; if (len == 2U + npids * 2) { data += 2; fprintf(f, "Requested %u (%X) parameter IDs:\n", npids, npids); for (i = 0; i < npids; i++) { uint16_t pid = (((uint16_t)(data[0])) << 8) | ((uint16_t)(data[1])); data += 2; fprintf(f, "\t%04X (%s)\n", pid, dusb_cmd_param_type2name(pid)); } fputc('\n', f); } } break; case DUSB_VPKT_PARM_DATA: { uint16_t nparams = (((uint16_t)(data[0])) << 8) | ((uint16_t)(data[1])); uint16_t i; uint32_t additional_size = 0; if (len >= 2U + 3 * nparams) { data += 2; fprintf(f, "Received %u (%X) parameter values:\n", nparams, nparams); for (i = 0; i < nparams; i++) { uint16_t pid = (((uint16_t)(data[0])) << 8) | ((uint16_t)(data[1])); uint8_t ok; data += 2; ok = !(*data++); fprintf(f, "\t%04X (%s): ", pid, dusb_cmd_param_type2name(pid)); if (ok) { uint16_t j; uint16_t size = (((uint16_t)(data[0])) << 8) | ((uint16_t)(data[1])); data += 2; additional_size += size + 2; if (len < 2U + 3 * nparams + additional_size) { break; } fprintf(f, "OK, size %04X\n\t\t", size); for (j = 0; j < size;) { fprintf(f, "%02X ", *data++); if (!(++j & 15)) { fprintf(f, "\n\t\t"); } } fputc('\n', f); } else { fputs("NOK !\n", f); } } } // else do 
nothing. } break; case DUSB_VPKT_PARM_SET: { uint16_t id = (((uint16_t)(data[0])) << 8) | ((uint16_t)(data[1])); uint16_t size = (((uint16_t)(data[2])) << 8) | ((uint16_t)(data[3])); uint16_t i; data += 4; if (len == 4U + size) { fprintf(f, "Sending value of size %04X for parameter %04X\n\t", size, id); for (i = 0; i < size; i++) { fprintf(f, "%02X ", *data++); if (!(++i & 15)) { fprintf(f, "\n\t"); } } } // else do nothing. } break; case DUSB_VPKT_OS_BEGIN: { uint32_t size = (((uint32_t)data[7]) << 24) | (((uint32_t)data[8]) << 16) | (((uint32_t)data[9]) << 8) | (((uint32_t)data[10]) << 0); fprintf(f, "Size: %lu / %08lX\n", (unsigned long)size, (unsigned long)size); } break; case DUSB_VPKT_OS_ACK: { uint32_t size = (((uint32_t)data[0]) << 24) | (((uint32_t)data[1]) << 16) | (((uint32_t)data[2]) << 8) | (((uint32_t)data[3]) << 0); fprintf(f, "Chunk size: %lu / %08lX\n", (unsigned long)size, (unsigned long)size); } break; case DUSB_VPKT_OS_HEADER: case DUSB_VPKT_OS_DATA: { if (model == CALC_TI83PCE_USB || model == CALC_TI84PCE_USB) { uint32_t addr = (((uint32_t)data[3]) << 24) | (((uint32_t)data[2]) << 16) | (((uint32_t)data[1]) << 8) | (((uint32_t)data[0]) << 0); fprintf(f, "Address: %08lX\n", (unsigned long)addr); } else if (model != CALC_TI89T_USB) { uint16_t addr = (((uint16_t)data[0]) << 8) | (((uint32_t)data[1]) << 0); fprintf(f, "Address: %04X\tPage: %02X\tFlag: %02X\n", addr, data[2], data[3]); } // else do nothing. } break; case DUSB_VPKT_DELAY_ACK: { uint32_t delay = (((uint32_t)data[0]) << 24) | (((uint32_t)data[1]) << 16) | (((uint32_t)data[2]) << 8) | (data[3] << 0); fprintf(f, "Delay: %lu\n", (unsigned long)delay); } break; case DUSB_VPKT_ERROR: { int err = err_code((((uint16_t)data[0]) << 8) | (((uint32_t)data[1]) << 0)); fprintf(f, "Error code: %u (%04X)\n", err, err); } break; // Nothing to do. 
case DUSB_VPKT_VAR_CNTS: case DUSB_VPKT_MODE_SET: case DUSB_VPKT_EOT_ACK: case DUSB_VPKT_DATA_ACK: case DUSB_VPKT_EOT: break; // TODO case DUSB_VPKT_DIR_REQ: case DUSB_VPKT_VAR_HDR: case DUSB_VPKT_RTS: case DUSB_VPKT_VAR_REQ: case DUSB_VPKT_MODIF_VAR: case DUSB_VPKT_EXECUTE: { fputs("(no extra dissection performed for now)\n", f); } break; default: { fputs("(not performing extra dissection on unknown vpkt type)\n", f); } break; } return ret; }
int main(int argc, char *argv[]) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device preferable) int niters = ITERS; // number of iterations int nsteps; float step_size; size_t nwork_groups; size_t max_size, work_group_size = 8; float pi_res; cl_mem d_partial_sums; char *kernelsource = getKernelSource("../pi_ocl.cl"); // Kernel source cl_int err; cl_device_id device; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel_pi; // compute kernel // Set up OpenCL context, queue, kernel, etc. cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices cl_device_id devices[MAX_DEVICES]; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { printf("Invalid device index (try '--list')\n"); return EXIT_FAILURE; } device = devices[deviceIndex]; char name[MAX_INFO_STRING]; getDeviceName(device, name); printf("\nUsing OpenCL device: %s\n", name); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device, 0, &err); checkError(err, "Creating command queue"); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel_pi = clCreateKernel(program, "pi", &err); 
checkError(err, "Creating kernel"); // Find kernel work-group size err = clGetKernelWorkGroupInfo (kernel_pi, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); checkError(err, "Getting kernel work group info"); // Now that we know the size of the work-groups, we can set the number of // work-groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if (nwork_groups < 1) { err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL); checkError(err, "Getting device compute unit info"); work_group_size = in_nsteps / (nwork_groups * niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/(float)nsteps; printf("nsteps:%d\n", nsteps); printf("niters:%d\n", niters); printf("work_group_size:%zd\n", work_group_size); printf("n work groups:%ld\n", nwork_groups); printf("step_size:%f\n", step_size); h_psum = calloc(sizeof(float), nwork_groups); if (!h_psum) { printf("Error: could not allocate host memory for h_psum\n"); return EXIT_FAILURE; } printf(" %ld work-groups of size %ld. 
%d Integration steps\n", nwork_groups, work_group_size, nsteps); d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); checkError(err, "Creating buffer d_partial_sums"); // Set kernel arguments err = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums); checkError(err, "Settin kernel args"); // Execute the kernel over the entire range of our 1D input data set // using the maximum number of work items for this device size_t global = nsteps / niters; size_t local = work_group_size; double rtime = wtime(); err = clEnqueueNDRangeKernel( commands, kernel_pi, 1, NULL, &global, &local, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clEnqueueReadBuffer( commands, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); checkError(err, "Reading back d_partial_sums"); // complete the sum and compute the final integral value on the host pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); // clean up clReleaseMemObject(d_partial_sums); clReleaseProgram(program); clReleaseKernel(kernel_pi); clReleaseCommandQueue(commands); clReleaseContext(context); free(kernelsource); free(h_psum); }
int main(int argc, char *argv[]) { float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix int N; // A[N][N], B[N][N], C[N][N] int size; // number of elements in each matrix cl_mem d_a, d_b, d_c; // Matrices in device memory double start_time; // Starting time double run_time; // timing data char * kernelsource; // kernel source string cl_int err; // error code returned from OpenCL calls cl_device_id device; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel N = ORDER; size = N * N; h_A = (float *)malloc(size * sizeof(float)); h_B = (float *)malloc(size * sizeof(float)); h_C = (float *)malloc(size * sizeof(float)); //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices cl_device_id devices[MAX_DEVICES]; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { printf("Invalid device index (try '--list')\n"); return EXIT_FAILURE; } device = devices[deviceIndex]; char name[MAX_INFO_STRING]; getDeviceName(device, name); printf("\nUsing OpenCL device: %s\n", name); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device, 0, &err); checkError(err, "Creating command queue"); //-------------------------------------------------------------------------------- // Run sequential version on the host //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU 
======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = wtime(); seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = wtime() - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_A, &err); checkError(err, "Creating buffer d_a"); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_B, &err); checkError(err, "Creating buffer d_b"); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, &err); checkError(err, "Creating buffer d_c"); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... 
Naive //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_elem.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_elem.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_elem.cl"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. 
const size_t global[2] = {N, N}; err = clEnqueueNDRangeKernel( commands, kernel, 2, NULL, global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Copying back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_row.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row.cl"); printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. 
const size_t global = N; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row_priv.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program from C_row_priv.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row_priv.cl"); printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... 
computing // a dot product for each element of the product matrix. const size_t global = N; const size_t local = ORDER / 16; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // Clean up! //-------------------------------------------------------------------------------- free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(int ac, char *av[]) { char myname[HOSTNAMESIZE]; char *progname = av[0]; char *logfname; unsigned version; prod_class_t clss; prod_spec spec; int seq_start = 0; int status; ErrorObj* error; unsigned remotePort = LDM_PORT; logfname = "-"; remote = "localhost"; (void)set_timestamp(&clss.from); clss.to = TS_ENDT; clss.psa.psa_len = 1; clss.psa.psa_val = &spec; spec.feedtype = DEFAULT_FEEDTYPE; spec.pattern = ".*"; { extern int optind; extern char *optarg; int ch; int logmask = (LOG_MASK(LOG_ERR) | LOG_MASK(LOG_WARNING) | LOG_MASK(LOG_NOTICE)); while ((ch = getopt(ac, av, "vxl:h:f:P:s:")) != EOF) switch (ch) { case 'v': logmask |= LOG_MASK(LOG_INFO); break; case 'x': logmask |= LOG_MASK(LOG_DEBUG); break; case 'l': logfname = optarg; break; case 'h': remote = optarg; break; case 'f': spec.feedtype = atofeedtypet(optarg); if(spec.feedtype == NONE) { fprintf(stderr, "Unknown feedtype \"%s\"\n", optarg); usage(progname); } break; case 'P': { char* suffix = ""; long port; errno = 0; port = strtol(optarg, &suffix, 0); if (0 != errno || 0 != *suffix || 0 >= port || 0xffff < port) { (void)fprintf(stderr, "%s: invalid port %s\n", av[0], optarg); usage(av[0]); } remotePort = (unsigned)port; break; } case 's': seq_start = atoi(optarg); break; case '?': usage(progname); break; } ac -= optind; av += optind; if(ac < 1) usage(progname); (void) setulogmask(logmask); } /* * Set up error logging */ (void) openulog(ubasename(progname), LOG_NOTIME, LOG_LDM, logfname); /* * register exit handler */ if(atexit(cleanup) != 0) { serror("atexit"); exit(1); } /* * set up signal handlers */ set_sigactions(); (void) strcpy(myname, ghostname()); /* * Contact the server. 
*/ error = ldm_clnttcp_create_vers(remote, remotePort, SIX, &clnt, NULL, NULL); (void)exitIfDone(1); if (!error) { version = SIX; hiya = my_hiya_6; send_product = send_product_6; nullproc = nullproc_6; } else if (LDM_CLNT_BAD_VERSION == err_code(error)) { err_free(error); error = ldm_clnttcp_create_vers(remote, remotePort, FIVE, &clnt, NULL, NULL); (void)exitIfDone(1); if (!error) { version = FIVE; hiya = my_hiya_5; send_product = send_product_5; nullproc = NULL; } } if (error) { err_log(error, ERR_FAILURE); err_free(error); status = 1; } else { udebug("version %u", version); status = ldmsend(clnt, &clss, myname, seq_start, ac, av); } return status != 0; }
int main(int argc, char *argv[]) { static const struct option long_opts[] = { /* commands */ {"inject", required_argument, NULL, c_INJECT}, {"remove", required_argument, NULL, c_REMOVE}, {"hexdump", required_argument, NULL, c_HEXDUMP}, /* options */ {"output", required_argument, NULL, o_OUTPUT}, /* flags */ {"force", no_argument, NULL, f_FORCE}, {"p8", no_argument, NULL, f_P8}, {"verbose", no_argument, NULL, f_VERBOSE}, {"help", no_argument, NULL, f_HELP}, {0, 0, 0, 0} }; static const char *short_opts = "I:R:H:o:fpvh"; int rc = EXIT_FAILURE; if (argc == 1) usage(args.short_name, false), exit(rc); int opt = 0, idx = 0; while ((opt = getopt_long(argc, argv, short_opts, long_opts, &idx)) != -1) if (process_argument(&args, opt, optarg) < 0) goto error; /* getopt_long doesn't know what to do with orphans, */ /* so we'll scoop them up here, and deal with them later */ while (optind < argc) if (process_option(&args, argv[optind++]) < 0) goto error; if (args.verbose == f_VERBOSE) args_dump(&args); if (validate_args(&args) < 0) goto error; if (process_args(&args) < 0) goto error; rc = EXIT_SUCCESS; if (false) { err_t *err; error: err = err_get(); assert(err != NULL); fprintf(stderr, "%s: %s : %s(%d) : (code=%d) %.*s\n", program_invocation_short_name, err_type_name(err), err_file(err), err_line(err), err_code(err), err_size(err), (char *)err_data(err)); } return rc; }
int main(int argc, char * argv[]) { MStatus stat; // initialise the maya library - This basically starts up Maya if(MLibrary::initialize(argv[0], true) != MS::kSuccess) { err_stop("[ERROR] Maya failed to initialise\n"); } std::string fileName = argv[1]; cout << "Loading file: " << fileName; //std::string scale = "0.01"; std::string scale = "1.0"; std::string todo = "all"; std::string target = "tegra"; // or tegra if (argc > 2) { scale = argv[2]; } if (argc > 3) { todo = argv[3]; } if (argc > 4) { target = argv[4]; } Globals & globalFlags = Globals::GetGlobals(); globalFlags.TARGET = target; globalFlags.SCALE = (float)atof(scale.c_str()); char txt[128]={0}; sprintf(txt,"Scale: %f\n",globalFlags.SCALE); err_info(txt); for (unsigned int i=0;i<fileName.length();i++) if (fileName[i] == '\\') fileName[i] = '/'; err_info("Opening file: " + fileName); stat = MFileIO::open(fileName.c_str()); err_code(stat); int dot = fileName.rfind("."); int slash = fileName.rfind("/"); string folder = fileName; folder.erase(slash, folder.length() - slash); fileName.erase(dot, fileName.length() - dot); fileName.erase(0, slash+1); if (todo == "model" || todo == "all") { ModelExporter & modelExport = ModelExporter::GetExporter(); modelExport.Export(folder, fileName); AnimationExport & animationExport = AnimationExport::GetExporter(); animationExport.Export(folder, fileName); UVAnimationExport & uvAnimationExport = UVAnimationExport::GetExporter(); uvAnimationExport.Export(folder, fileName); } if (todo == "locator" || todo == "all") { LocatorExport & locatorExport = LocatorExport::GetExporter(); locatorExport.Export(folder, fileName); } if (todo == "ref" || todo == "all") { RefSceneExport & refExport = RefSceneExport::GetExporter(); refExport.Export(folder, fileName); } if (todo == "stop_circle" || todo == "all") { StopCircleExport & stopCircleExport = StopCircleExport::GetExporter(); stopCircleExport.Export(folder, fileName); } if (todo == "launcher" || todo == "all") { BotLauncherExport & 
launcherExport = BotLauncherExport::GetExporter(); launcherExport.Export(folder, fileName); } if (todo == "curve" || todo == "all") { CurveExport & curveExport = CurveExport::GetExporter(); curveExport.Export(folder, fileName); } if (todo == "wire" || todo == "all") { WireExport & wireExport = WireExport::GetExporter(); wireExport.Export(folder, fileName); } if (todo == "collision_mesh" || todo == "all") { CollisionMeshExport & cllisionMeshExport = CollisionMeshExport::GetExporter(); cllisionMeshExport.Export(folder, fileName); } if (todo == "graph_mesh" || todo == "all") { GraphExport & graphExport = GraphExport::GetExporter(); graphExport.Export(folder, fileName); } if (todo == "doors" || todo == "all") { DoorsExport & doorsExport = DoorsExport::GetExporter(); doorsExport.Export(folder, fileName); } if (todo == "physics" || todo == "all") { PhysicsExport & physicsExport = PhysicsExport::GetExporter(); physicsExport.Export(folder, fileName); } return 0; }
int main(void) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device prefereable) int niters = ITERS; // number of iterations int nsteps; float step_size; ::size_t nwork_groups; ::size_t max_size, work_group_size = 8; float pi_res; cl::Buffer d_partial_sums; try { // Create a context cl::Context context(DEVICE); // Create the program object cl::Program program(context, util::loadProgram("pi_ocl.cl"), true); // Get the command queue cl::CommandQueue queue(context); // Create the kernel object for quering information cl::Kernel ko_pi(program, "pi"); // Get the device we are using std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); cl::Device device = devices[0]; // Get the work group size work_group_size = ko_pi.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device); //printf("wgroup_size = %lu\n", work_group_size); auto pi = cl::make_kernel<int, float, cl::LocalSpaceArg, cl::Buffer>(program, "pi"); // Now that we know the size of the work_groups, we can set the number of work // groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if ( nwork_groups < 1) { nwork_groups = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); work_group_size=in_nsteps / (nwork_groups*niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/static_cast<float>(nsteps); std::vector<float> h_psum(nwork_groups); printf( " %d work groups of size %d. 
%d Integration steps\n", (int)nwork_groups, (int)work_group_size, nsteps); d_partial_sums = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups); util::Timer timer; // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device pi( cl::EnqueueArgs( queue, cl::NDRange(nwork_groups * work_group_size), cl::NDRange(work_group_size)), niters, step_size, cl::Local(sizeof(float) * work_group_size), d_partial_sums); cl::copy(queue, d_partial_sums, begin(h_psum), end(h_psum)); // complete the sum and compute final integral value pi_res = 0.0f; for (unsigned int i = 0; i< nwork_groups; i++) { pi_res += h_psum[i]; } pi_res = pi_res * step_size; //rtime = wtime() - rtime; double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); } catch (cl::Error err) { std::cout << "Exception\n"; std::cerr << "ERROR: " << err.what() << "(" << err_code(err.err()) << ")" << std::endl; } }
int main(void) { //############################################### // // Declare variables for OpenCL // //############################################### int err; // error code returned from OpenCL calls size_t global; // global domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_calculate_imagerowdots_iterations; // compute kernel cl_kernel ko_calculate_colorrow; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory int i; //############################################### // // Set values for mandelbrot // //############################################### //plane section values float x_ebene_min = -1; float y_ebene_min = -1; float x_ebene_max = 2; float y_ebene_max = 1; //monitor resolution values const long x_mon = 640; const long y_mon = 480; //Iterations long itr = 100; //abort condition float abort_value = 2; //Number of images per second long fps = 24; //video duration in seconds long video_duration = 3; //zoom speed in percentage float reduction = 5; //zoom dot my_complex_t zoom_dot; //############################################### // // Set up platform and GPU device // //############################################### cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); checkError(err, "Finding platforms"); if (numPlatforms == 0) { printf("Found 0 platforms!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); checkError(err, "Getting platforms"); // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) { break; } } if (device_id == NULL) checkError(err, "Finding a device"); err = output_device_info(device_id); checkError(err, "Printing device 
output"); //############################################### // // Create context, command queue and kernel // //############################################### // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); checkError(err, "Creating command queue"); //Read Kernel source FILE *fp; char *source_str; size_t source_size, program_size; fp = fopen("./kernel/calculate_iterations.cl", "r"); if (!fp) { printf("Failed to load kernel\n"); return 1; } fseek(fp, 0, SEEK_END); program_size = ftell(fp); rewind(fp); source_str = (char*) malloc(program_size + 1); source_str[program_size] = '\0'; fread(source_str, sizeof(char), program_size, fp); fclose(fp); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &source_str, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); return EXIT_FAILURE; } // Create the compute kernel from the program ko_calculate_imagerowdots_iterations = clCreateKernel(program, "calculate_imagerowdots_iterations", &err); checkError(err, "Creating kernel"); // Create the compute kernel from the program ko_calculate_colorrow = clCreateKernel(program, "calculate_colorrow", 
&err); checkError(err, "Creating kernel"); int number_images = 0; do { //Get memory for image long* h_image = (long*) calloc(x_mon * y_mon, sizeof(long)); unsigned char* h_image_pixel = (unsigned char*) calloc( x_mon * y_mon * 3, sizeof(unsigned char)); //############################################### //############################################### // // Loop to calculate image dot iterations // //############################################### //############################################### float y_value = y_ebene_max; float delta_y = delta(y_ebene_min, y_ebene_max, y_mon); for (int row = 0; row < y_mon; ++row) { //############################################### // // Create and write buffer // //############################################### //Get memory for row long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector d_a = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * x_mon, NULL, &err); checkError(err, "Creating buffer d_a"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); checkError(err, "Copying h_a to device at d_a"); //############################################### // // Set the arguments to our compute kernel // //############################################### err = clSetKernelArg(ko_calculate_imagerowdots_iterations, 0, sizeof(float), &x_ebene_min); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 1, sizeof(float), &x_ebene_max); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 2, sizeof(float), &y_value); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 3, sizeof(long), &x_mon); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 4, sizeof(float), &abort_value); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 5, sizeof(long), &itr); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 6, sizeof(cl_mem), &d_a); checkError(err, "Setting kernel arguments"); 
/*__kernel void calculate_imagerowdots_iterations(const float x_min, const float x_max, const float y_value, const long x_mon, const float abort_value, const long itr, __global long * imagerow)*/ // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = x_mon; err = clEnqueueNDRangeKernel(commands, ko_calculate_imagerowdots_iterations, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); // Wait for the commands to complete err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); // Read back the results from the compute device err = clEnqueueReadBuffer(commands, d_a, CL_TRUE, 0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array!\n%s\n", err_code(err)); exit(1); } //reduce y y_value -= delta_y; //cope row to image memcpy(h_image + row * x_mon, h_image_row, sizeof(long) * x_mon); free(h_image_row); } // for (i = 0; i < x_mon * y_mon; ++i) { // printf("%ld ", h_image[i]); // } // fflush(stdout); //############################################### //############################################### // // End of loop to calculate image dot iterations // //############################################### //############################################### //############################################### //############################################### // // Beginn color calculation // //############################################### //############################################### for (int row = 0; row < y_mon; ++row) { //Get memory for row long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector memcpy(h_image_row, h_image + row * x_mon, sizeof(long) * x_mon); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(long) * x_mon, NULL, &err); checkError(err, "Creating buffer d_a"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 
0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); checkError(err, "Copying h_image_row to device at d_a"); unsigned char* h_imagepixel_row = (unsigned char*) calloc(x_mon * 3, sizeof(unsigned char)); // a vector d_b = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned char) * x_mon * 3, NULL, &err); checkError(err, "Creating buffer d_b"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0, NULL, NULL); checkError(err, "Copying h_imagepixel_row to device at d_b"); //############################################### // // Set the arguments to our compute kernel // //############################################### err = clSetKernelArg(ko_calculate_colorrow, 0, sizeof(long), &x_mon); err |= clSetKernelArg(ko_calculate_colorrow, 1, sizeof(long), &itr); err |= clSetKernelArg(ko_calculate_colorrow, 2, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_calculate_colorrow, 3, sizeof(cl_mem), &d_b); checkError(err, "Setting kernel arguments"); /*__kernel void calculate_colorrow(const long width, long itr, long * imagerowvalues, unsigned char * imagerow)*/ // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = x_mon; err = clEnqueueNDRangeKernel(commands, ko_calculate_colorrow, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); // Wait for the commands to complete err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); // Read back the results from the compute device err = clEnqueueReadBuffer(commands, d_b, CL_TRUE, 0, sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array!\n%s\n", err_code(err)); exit(1); } memcpy(h_image_pixel + row * x_mon * 3, h_imagepixel_row, sizeof(unsigned char) * x_mon * 3); free(h_image_row); free(h_imagepixel_row); } if (number_images 
== 0) { zoom_dot = find_dot_to_zoom(x_ebene_min, x_ebene_max, y_ebene_min, y_ebene_max, h_image, y_mon, x_mon, itr); } reduce_plane_section_focus_dot(&x_ebene_min, &x_ebene_max, &y_ebene_min, &y_ebene_max, reduction, zoom_dot); // save the image char filename[50]; sprintf(filename, "img-%d.bmp", number_images); safe_image_to_bmp(x_mon, y_mon, h_image_pixel, filename); free(h_image); free(h_image_pixel); number_images++; itr = (long) (itr + itr * reduction / 100); printf("%d\n", number_images); fflush(stdout); } while (number_images < (fps * video_duration)); //############################################### // // cleanup then shutdown // //############################################### clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseProgram(program); clReleaseKernel(ko_calculate_imagerowdots_iterations); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }