コード例 #1
0
int main(int argc, char** argv)
{
	if (argc != 2)
	{
		std::cout << "Usage: ./pi_vocl num\n"
		          << "\twhere num = 1, 4 or 8\n";
		return EXIT_FAILURE;
	}

	int vector_size = atoi(argv[1]);

	// Define some vector size specific constants
	unsigned int ITERS, WGS;
	if (vector_size == 1)
	{
		ITERS = 262144;
		WGS = 8;
	}
	else if (vector_size == 4)
	{
		ITERS = 262144 / 4;
		WGS = 32;
	}
	else if (vector_size == 8)
	{
		ITERS = 262144 / 8;
		WGS = 64;
	}
	else
	{
		std::cerr << "Invalid vector size\n";
		return EXIT_FAILURE;
	}

	// Set some default values:
	// Default number of steps (updated later to device preferable)
	unsigned int in_nsteps = INSTEPS;
	// Default number of iterations
	unsigned int niters = ITERS;
	unsigned int work_group_size = WGS;

	try
	{
		// Create context, queue and build program
		cl::Context context(DEVICE);
		cl::CommandQueue queue(context);
		cl::Program program(context, util::loadProgram("../pi_vocl.cl"), true);
		cl::Kernel kernel;

		// Now that we know the size of the work_groups, we can set the number of work
		// groups, the actual number of steps, and the step size
		unsigned int nwork_groups = in_nsteps/(work_group_size*niters);

		// Get the max work group size for the kernel pi on our device
		unsigned int max_size;
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
		if (vector_size == 1)
		{
			kernel = cl::Kernel(program, "pi");
			max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]);
		}
		else if (vector_size == 4)
		{
			kernel = cl::Kernel(program, "pi_vec4");
			max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]);
		}
		else if (vector_size == 8)
		{
			kernel = cl::Kernel(program, "pi_vec8");
			max_size = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(devices[0]);
		}

		if (max_size > work_group_size)
		{
			work_group_size = max_size;
			nwork_groups = in_nsteps/(nwork_groups*niters);
		}

		if (nwork_groups < 1)
		{
			nwork_groups = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
			work_group_size = in_nsteps/(nwork_groups*niters);
		}

		unsigned int nsteps = work_group_size * niters * nwork_groups;
		float step_size = 1.0f / (float) nsteps;

		// Vector to hold partial sum
		std::vector<float> h_psum(nwork_groups);

		std::cout << nwork_groups << " work groups of size " << work_group_size << ".\n"
		          << nsteps << " Integration steps\n";

        cl::Buffer d_partial_sums(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups);

        // Start the timer
        util::Timer timer;

        // Execute the kernel over the entire range of our 1d input data et
        // using the maximum number of work group items for this device
        cl::NDRange global(nwork_groups * work_group_size);
        cl::NDRange local(work_group_size);

        kernel.setArg(0, niters);
        kernel.setArg(1, step_size);
        cl::LocalSpaceArg localmem = cl::Local(sizeof(float) * work_group_size);
        kernel.setArg(2, localmem);
        kernel.setArg(3, d_partial_sums);
        queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local);

        cl::copy(queue, d_partial_sums, begin(h_psum), end(h_psum));

        // Complete the sum and compute the final integral value
        float pi_res = 0.0;
        for (float x : h_psum)
            pi_res += x;
        pi_res *= step_size;

        // Stop the timer
		double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.;
        std::cout << "The calculation ran in " << rtime << " seconds\n"
                  << " pi = " << pi_res << " for " << nsteps << " steps\n";

        return EXIT_SUCCESS;


	}
	catch (cl::Error err)
	{
		std::cout << "Exception\n";
		std::cerr 
            << "ERROR: "
            << err.what()
            << "("
            << err_code(err.err())
           << ")"
           << std::endl;
        return EXIT_FAILURE;
	}
}
コード例 #2
0
int main(int argc, char *argv[])
{
    float *h_psum;					// vector to hold partial sum
    int in_nsteps = INSTEPS;		// default number of steps (updated later to device prefereable)
    int niters = ITERS;				// number of iterations
    int nsteps;
    float step_size;
    ::size_t nwork_groups;
    ::size_t max_size, work_group_size = 8;
    float pi_res;

    cl::Buffer d_partial_sums;

    try
    {
        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);

        // Get list of devices
        std::vector<cl::Device> devices;
        unsigned numDevices = getDeviceList(devices);

        // Check device index in range
        if (deviceIndex >= numDevices)
        {
          std::cout << "Invalid device index (try '--list')\n";
          return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];

        std::string name;
        getDeviceName(device, name);
        std::cout << "\nUsing OpenCL device: " << name << "\n";

        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

        // Create the program object
        cl::Program program(context, util::loadProgram("../pi_ocl.cl"), true);

        // Create the kernel object for quering information
        cl::Kernel ko_pi(program, "pi");

        // Get the work group size
        work_group_size = ko_pi.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);
        //printf("wgroup_size = %lu\n", work_group_size);

        cl::make_kernel<int, float, cl::LocalSpaceArg, cl::Buffer> pi(program, "pi");

        // Now that we know the size of the work_groups, we can set the number of work
        // groups, the actual number of steps, and the step size
        nwork_groups = in_nsteps/(work_group_size*niters);

        if ( nwork_groups < 1) {
            nwork_groups = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
            work_group_size=in_nsteps / (nwork_groups*niters);
        }

        nsteps = work_group_size * niters * nwork_groups;
        step_size = 1.0f/static_cast<float>(nsteps);
        std::vector<float> h_psum(nwork_groups);

        printf(
            " %d work groups of size %d.  %d Integration steps\n",
            (int)nwork_groups,
            (int)work_group_size,
            nsteps);

        d_partial_sums = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups);

        util::Timer timer;

        // Execute the kernel over the entire range of our 1d input data set
        // using the maximum number of work group items for this device
        pi(
            cl::EnqueueArgs(
                    queue,
                    cl::NDRange(nsteps / niters),
                    cl::NDRange(work_group_size)),
                    niters,
                    step_size,
                    cl::Local(sizeof(float) * work_group_size),
                    d_partial_sums);

        cl::copy(queue, d_partial_sums, h_psum.begin(), h_psum.end());

        // complete the sum and compute final integral value
        pi_res = 0.0f;
        for (unsigned int i = 0; i< nwork_groups; i++) {
                pi_res += h_psum[i];
        }
        pi_res = pi_res * step_size;

        //rtime = wtime() - rtime;
        double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.;
        printf("\nThe calculation ran in %lf seconds\n", rtime);
        printf(" pi = %f for %d steps\n", pi_res, nsteps);

        }
        catch (cl::Error err) {
            std::cout << "Exception\n";
            std::cerr
            << "ERROR: "
            << err.what()
            << "("
            << err_code(err.err())
            << ")"
            << std::endl;
        }
}
コード例 #3
0
int main(void)
{
    float *h_psum;					// vector to hold partial sum
    int in_nsteps = INSTEPS;		// default number of steps (updated later to device prefereable)
    int niters = ITERS;				// number of iterations
    int nsteps;
    float step_size;
    ::size_t nwork_groups;
    ::size_t max_size, work_group_size = 8;
    float pi_res;

	cl::Buffer d_partial_sums;

	try
	{
        // Create a context
        cl::Context context(DEVICE);

        // Create the program object
		cl::Program program(context, util::loadProgram("pi_ocl.cl"), true);

		// Get the command queue
        cl::CommandQueue queue(context);

        // Create the kernel object for quering information
		cl::Kernel ko_pi(program, "pi");
        // Get the device we are using
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
        cl::Device device = devices[0];

        // Get the work group size
		work_group_size = ko_pi.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(device);
        //printf("wgroup_size = %lu\n", work_group_size);

		auto pi = cl::make_kernel<int, float, cl::LocalSpaceArg, cl::Buffer>(program, "pi");

		// Now that we know the size of the work_groups, we can set the number of work
		// groups, the actual number of steps, and the step size
		nwork_groups = in_nsteps/(work_group_size*niters);

		if ( nwork_groups < 1) {
			nwork_groups = 
				device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
			work_group_size=in_nsteps / (nwork_groups*niters);
		}

		nsteps = work_group_size * niters * nwork_groups;
		step_size = 1.0f/static_cast<float>(nsteps);
		std::vector<float> h_psum(nwork_groups);

		printf(
			" %d work groups of size %d.  %d Integration steps\n", 
			(int)nwork_groups, 
			(int)work_group_size,
			nsteps);

		d_partial_sums = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups);

		util::Timer timer;

		// Execute the kernel over the entire range of our 1d input data set
		// using the maximum number of work group items for this device
		pi(
			cl::EnqueueArgs(
				queue,
				cl::NDRange(nwork_groups * work_group_size),
				cl::NDRange(work_group_size)), 
			niters,
			step_size,
			cl::Local(sizeof(float) * work_group_size),
			d_partial_sums);

		cl::copy(queue, d_partial_sums, begin(h_psum), end(h_psum));

		// complete the sum and compute final integral value
		pi_res = 0.0f;
		for (unsigned int i = 0; i< nwork_groups; i++) {
			pi_res += h_psum[i];
		}
		pi_res = pi_res * step_size;
	
		//rtime = wtime() - rtime;
		double rtime = static_cast<double>(timer.getTimeMilliseconds()) / 1000.;
		printf("\nThe calculation ran in %lf seconds\n", rtime);
		printf(" pi = %f for %d steps\n", pi_res, nsteps);

	}
	catch (cl::Error err) {
		std::cout << "Exception\n";
		std::cerr 
            << "ERROR: "
            << err.what()
            << "("
            << err_code(err.err())
           << ")"
           << std::endl;
	}
}