Пример #1
0
    /**
     * Write one scalar per process into a domain dataset and optionally
     * attach a single attribute to that dataset.
     *
     * Every process contributes exactly one element. In each simulation
     * dimension the dataset extent is the number of GPU nodes and the write
     * position is this process' position in the grid, both taken from the
     * grid controller.
     *
     * @param params thread parameters; provides the data collector and the current step
     * @param name name of the dataset
     * @param value scalar written by this process
     * @param attrName name of the optional attribute; no attribute is written when empty
     * @param attribute value of the optional attribute
     */
    void operator()(ThreadParams& params,
            const std::string& name, T_Scalar value,
            const std::string& attrName = "", T_Attribute attribute = T_Attribute())
    {
        log<picLog::INPUT_OUTPUT>("HDF5: write %1%D scalars: %2%") % simDim % name;

        // each process owns exactly one element of the dataset
        Dimensions elementsPerProcess(1, 1, 1);
        // the global domain starts at the origin
        Dimensions domainOrigin(0, 0, 0);
        // extent of the dataset over all processes (unused dimensions stay 1)
        Dimensions totalExtent(1, 1, 1);
        // where this process writes its element (unused dimensions stay 0)
        Dimensions writeOffset(0, 0, 0);

        uint32_t dim = 0;
        while (dim < simDim)
        {
            totalExtent[dim] = Environment<simDim>::get().GridController().getGpuNodes()[dim];
            writeOffset[dim] = Environment<simDim>::get().GridController().getPosition()[dim];
            ++dim;
        }

        // avoid a deadlock between unfinished pmacc tasks and the MPI calls of the write below
        __getTransactionEvent().waitForFinished();

        typename traits::PICToSplash<T_Scalar>::type scalarType;
        splash::Selection processSelection(elementsPerProcess);
        splash::Domain wholeDomain(domainOrigin, totalExtent);

        params.dataCollector->writeDomain(
            params.currentStep, /* id == time step */
            totalExtent,        /* total size of dataset over all processes */
            writeOffset,        /* write offset for this process */
            scalarType,         /* data type */
            simDim,             /* NDims spatial dimensionality of the field */
            processSelection,   /* data size of this process */
            name.c_str(),       /* data set name */
            wholeDomain,        /* offset and size of the global domain */
            DomainCollector::GridType,
            &value);

        // the attribute is optional: nothing more to do without a name
        if (attrName.empty())
            return;

        /* simulation attribute for data */
        typename traits::PICToSplash<T_Attribute>::type attributeType;

        log<picLog::INPUT_OUTPUT>("HDF5: write attribute %1% for scalars: %2%") % attrName % name;
        params.dataCollector->writeAttribute(
            params.currentStep,
            attributeType,
            name.c_str(),
            attrName.c_str(),
            &attribute);
    }
Пример #2
0
/// Reads a rectangle of the current frame back from OpenGL as RGBA bytes.
///
/// @param data Output parameter. Set to a buffer allocated here with new[]
///             that holds rect.size.x * rect.size.y * 4 bytes. This function
///             never frees it.
/// @param rect Rectangle to read; its position is added to this window's
///             absolute position.
///
/// @todo MOVE TO axGL lib.
void Window::GetWindowPixelData(unsigned char*& data, const ax::Rect& rect) const
{
	const int bytesPerPixel = 4; // GL_RGBA read as GL_UNSIGNED_BYTE

	const ax::Size frameSize(ax::App::GetInstance().GetFrameSize());
	const ax::Point topLeft(dimension.GetAbsoluteRect().position + rect.position);

	data = new unsigned char[rect.size.x * rect.size.y * bytesPerPixel];

	// glReadPixels counts y from the bottom of the framebuffer, so the
	// rectangle's row is measured upwards from the frame height.
	const int readX = topLeft.x;
	const int readY = frameSize.y - topLeft.y - rect.size.y;

	glReadPixels(readX, readY, rect.size.x, rect.size.y,
		GL_RGBA, // Format.
		GL_UNSIGNED_BYTE, // Type.
		static_cast<void*>(data));
}
Пример #3
0
void *quant_thread(void *args)
{
	gmactime_t s, t;

	barrier_wait(&barrierInit);

	getTime(&s);
	gmac_sem_post(&s_quant.free, 1);
	nextStage(&s_quant, &s_idct);
	getTime(&t);
	printTime(&s, &t, "Quant:SendRecv: ", "\n");

	ecl::config localSize(blockSize, blockSize);
	ecl::config globalSize(width, height);
	if(width  % blockSize) globalSize.x += blockSize;
	if(height % blockSize) globalSize.y += blockSize;
	ecl::error err;
	ecl::kernel k("quant", err);
	assert(err == eclSuccess);

	assert(k.setArg(2, width)       == eclSuccess);
	assert(k.setArg(3, height)      == eclSuccess);
	assert(k.setArg(4, float(1e-6)) == eclSuccess);

	for(unsigned i = 0; i < frames; i++) {
		getTime(&s);
		assert(k.setArg(0, s_quant.in)  == eclSuccess);
		assert(k.setArg(1, s_quant.out) == eclSuccess);
		assert(k.callNDRange(globalSize, localSize) == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "Quant:Run: ", "\n");

		getTime(&s);
		nextStage(&s_quant, &s_idct);
		getTime(&t);
		printTime(&s, &t, "Quant:SendRecv: ", "\n");
	}

	// Move one stage the pipeline stages the pipeline
	getTime(&s);
	nextStage(&s_quant, &s_idct);
	getTime(&t);
	printTime(&s, &t, "Quant:SendRecv: ", "\n");

	return NULL;
}
Пример #4
0
void *addVector(void *ptr)
{
	float *a, *b;
	float **c = (float **)ptr;
	gmactime_t s, t;
	ecl::error ret;
	getTime(&s);
	// Alloc & init input data
	ret = ecl::malloc((void **)&a, vecSize * sizeof(float));
	assert(ret == eclSuccess);
	ret = ecl::malloc((void **)&b, vecSize * sizeof(float));
	assert(ret == eclSuccess);

	for(unsigned i = 0; i < vecSize; i++) {
		a[i] = 1.f * rand() / RAND_MAX;
		b[i] = 1.f * rand() / RAND_MAX;
	}

	// Alloc output data
	ret = ecl::malloc((void **)c, vecSize * sizeof(float));
	assert(ret == eclSuccess);
	getTime(&t);
	printTime(&s, &t, "Alloc: ", "\n");

	// Call the kernel
	getTime(&s);
	ecl::config localSize(blockSize);
	ecl::config globalSize(vecSize / blockSize);
	if(vecSize % blockSize) globalSize.x++;
	globalSize.x *= localSize.x;

	ecl::kernel kernel("vecAdd", ret);
	assert(ret == eclSuccess);
	ret = kernel.setArg(0, *c);
	assert(ret == eclSuccess);
	ret = kernel.setArg(1, a);
	assert(ret == eclSuccess);
	ret = kernel.setArg(2, b);
	assert(ret == eclSuccess);
	ret = kernel.setArg(3, vecSize);
	assert(ret == eclSuccess);

	ret = kernel.callNDRange(globalSize, localSize);
	assert(ret == eclSuccess);

	getTime(&t);
	printTime(&s, &t, "Run: ", "\n");

	getTime(&s);
	float error = 0;
	for(unsigned i = 0; i < vecSize; i++) {
		error += (*c)[i] - (a[i] + b[i]);
	}
	getTime(&t);
	printTime(&s, &t, "Check: ", "\n");
	fprintf(stdout, "Error: %.02f\n", error);

	ecl::free(a);
	ecl::free(b);
	ecl::free(*c);

	return NULL;
}
Пример #5
0
int main(int argc, char *argv[])
{
    cl_uint samples = 256 * 256 * 4;
    size_t blockSizeX = 1;
    size_t blockSizeY = 1;
    cl_float *randArray = NULL;
    cl_float *deviceCallPrice = NULL;
    cl_float *devicePutPrice = NULL;
    cl_float *hostCallPrice = NULL;
    cl_float *hostPutPrice = NULL;
	ecl::error ret;

    cl_uint height = 64;

    /* Calculate width and height from samples */
    samples = samples / 4;
    samples = (samples / GROUP_SIZE)? (samples / GROUP_SIZE) * GROUP_SIZE: GROUP_SIZE;

    cl_uint tempVar1 = (cl_uint)sqrt((double)samples);
    tempVar1 = (tempVar1 / GROUP_SIZE)? (tempVar1 / GROUP_SIZE) * GROUP_SIZE: GROUP_SIZE;
    samples = tempVar1 * tempVar1;

    width = tempVar1;
    height = width;
    ret = ecl::compileSource(code);
	assert(ret == eclSuccess);
    setParam<cl_uint>(&width, widthStr, widthDefault);

    // Alloc & init input data
    randArray = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)];
    deviceCallPrice = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)];
    devicePutPrice = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)];
    assert(randArray != NULL);
    assert(deviceCallPrice != NULL);
    assert(devicePutPrice != NULL);
    hostCallPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
    if(hostCallPrice == NULL)
        return 0;
    hostPutPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
    if(hostPutPrice == NULL) {
        free(hostCallPrice);
        return 0;
    }

    // random initialisation of input
    for(cl_uint i = 0; i < width * height * 4; i++)
        randArray[i] = (float)rand() / (float)RAND_MAX;

    eclMemset(deviceCallPrice, 0, width * height * sizeof(cl_float4));
    eclMemset(devicePutPrice, 0, width * height * sizeof(cl_float4));
    eclMemset(hostCallPrice, 0, width * height * sizeof(cl_float4));
    eclMemset(hostPutPrice, 0, width * height * sizeof(cl_float4));

    // Call the kernel
    ecl::config globalSize(width, height);
    ecl::config localSize(blockSizeX, blockSizeY);
	ecl::config globalWorkOffset(0); 
    ecl::kernel kernel("blackScholes", ret);
    assert(ret == eclSuccess);
#ifndef __GXX_EXPERIMENTAL_CXX0X__
    ret = kernel.setArg(0, randArray);
	assert(ret == eclSuccess);
    ret = kernel.setArg(1, width);
	assert(ret == eclSuccess);
    ret = kernel.setArg(2, deviceCallPrice);
	assert(ret == eclSuccess);
    ret = kernel.setArg(3, devicePutPrice);
	assert(ret == eclSuccess);
    ret = kernel.callNDRange(globalSize, localSize, globalWorkOffset);
	assert(ret == eclSuccess);
#else
    ret = kernel(globalSize, localSize)(randArray, width, deviceCallPrice, devicePutPrice);
	assert(ret == eclSuccess);
#endif

    printf("deviceCallPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", deviceCallPrice[i]);
    }
    printf("\ndevicePutPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", devicePutPrice[i]);
    }

    blackScholesCPU(randArray, width, height, hostCallPrice, hostPutPrice);
    printf("\nhostCallPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", hostCallPrice[i]);
    }
    printf("\nhostPutPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", hostPutPrice[i]);
    }

    float error = 0.0f;
    float ref = 0.0f;
    bool callPriceResult = true;
    bool putPriceResult = true;
    float normRef;

    for(cl_uint i = 1; i < width * height * 4; ++i) {
        float diff = hostCallPrice[i] - deviceCallPrice[i];
        error += diff * diff;
        ref += hostCallPrice[i] * deviceCallPrice[i];
    }

    normRef =::sqrtf((float) ref);
    if (::fabs((float) ref) < 1e-7f) {
        callPriceResult = false;
    }
    if(callPriceResult) {
        float normError = ::sqrtf((float) error);
        error = normError / normRef;
        callPriceResult = error < 1e-6f;
    }

    for(cl_uint i = 1; i < width * height * 4; ++i) {
        float diff = hostPutPrice[i] - devicePutPrice[i];
        error += diff * diff;
        ref += hostPutPrice[i] * devicePutPrice[i];
    }

    normRef =::sqrtf((float) ref);
    if (::fabs((float) ref) < 1e-7f) {
        putPriceResult = false;
    }
    if(putPriceResult) {
        float normError = ::sqrtf((float) error);
        error = normError / normRef;
        putPriceResult = error < 1e-4f;
    }

    if(!(callPriceResult ? (putPriceResult ? true : false) : false)) {
        printf("Failed!\n");
    } else {
        printf("Passed!\n");
    }

    free(hostPutPrice);
    hostPutPrice = NULL;
    free(hostCallPrice);
    hostCallPrice = NULL;

    ecl::free(devicePutPrice);
    ecl::free(deviceCallPrice);
    ecl::free(randArray);

    return 0;
}
Пример #6
0
int memcpyTest(MemcpyType type, bool callKernel, void *(*memcpy_fn)(void *, const void *, size_t n))
{
	int error = 0;

	ecl::config globalSize (1);
	ecl::config localSize (1);

	ecl::error ret;
	ecl::kernel kernel("null", ret);
	assert(ret == eclSuccess);

	uint8_t *baseSrc = NULL;
	uint8_t *eclSrc = NULL;
	uint8_t *eclDst = NULL;

	baseSrc = (uint8_t *)malloc(maxCount);
	init(baseSrc, int(maxCount), 0xca);
	for (size_t count = minCount; count <= maxCount; count *= 2) {
		fprintf(stderr, "ALLOC: "FMT_SIZE"\n", count);

		if (type == GMAC_TO_GMAC) {
			assert(ecl::malloc((void **)&eclSrc, count) == eclSuccess);
			assert(ecl::malloc((void **)&eclDst, count) == eclSuccess);
		} else if (type == HOST_TO_GMAC) {
			eclSrc = (uint8_t *)malloc(count);
			assert(ecl::malloc((void **)&eclDst, count) == eclSuccess);
		} else if (type == GMAC_TO_HOST) {
			assert(ecl::malloc((void **)&eclSrc, count) == eclSuccess);
			eclDst = (uint8_t *)malloc(count);
		}

		for (size_t stride = 0, i = 1; stride < count/3; stride = i, i =  i * 2 - (i == 1? 0: 1)) {
			for (size_t copyCount = 1; copyCount < count/3; copyCount *= 2) {
				init(eclSrc + stride, int(copyCount), 0xca);
				if (stride == 0) {
					init(eclDst + stride, int(copyCount) + 1, 0);
				} else {
					init(eclDst + stride - 1, int(copyCount) + 2, 0);
				}
				assert(stride + copyCount <= count);

				if (callKernel) {
					ret = kernel.callNDRange(globalSize, localSize);
					assert(ret == eclSuccess);
				}
				memcpy_fn(eclDst + stride, eclSrc + stride, copyCount);

				int ret = memcmp(eclDst + stride, baseSrc + stride, copyCount);
				if (stride == 0) {
					ret = ret && (eclDst[stride - 1] == 0 && eclDst[stride + copyCount] == 0);
				} else {
					ret = ret && (eclDst[stride - 1] == 0 && eclDst[stride + copyCount] == 0);
				}

				if (ret != 0) {
#if 0
					fprintf(stderr, "Error: eclToGmacTest size: %zd, stride: %zd, copy: %zd\n",
						count    ,
						stride   ,
						copyCount);
#endif
					abort();
					error = 1;
					goto exit_test;
				}
#if 0
				for (unsigned k = 0; k < count; k++) {
					int ret = baseDst[k] != eclDst[k];
					if (ret != 0) {
						fprintf(stderr, "Error: eclToGmacTest size: %zd, stride: %zd, copy: %zd. Pos %u\n", count    ,
							stride   ,
							copyCount, k);
						error = 1;
					}
				}
#endif
			}
		}

		if (type == GMAC_TO_GMAC) {
			assert(ecl::free(eclSrc) == eclSuccess);
			assert(ecl::free(eclDst) == eclSuccess);
		} else if (type == HOST_TO_GMAC) {
			free(eclSrc);
			assert(ecl::free(eclDst) == eclSuccess);
		} else if (type == GMAC_TO_HOST) {
			assert(ecl::free(eclSrc) == eclSuccess);
			free(eclDst);
		}
	}
	free(baseSrc);

	return error;

exit_test:
	if (type == GMAC_TO_GMAC) {
		assert(ecl::free(eclSrc) == eclSuccess);
		assert(ecl::free(eclDst) == eclSuccess);
	} else if (type == HOST_TO_GMAC) {
		free(eclSrc);
		assert(ecl::free(eclDst) == eclSuccess);
	} else if (type == GMAC_TO_HOST) {
		assert(ecl::free(eclSrc) == eclSuccess);
		free(eclDst);
	}

	free(baseSrc);

	return error;
}
Пример #7
0
void *dct_thread(void *args)
{
	gmactime_t s, t;

	barrier_wait(&barrierInit);

	ecl::config localSize(blockSize, blockSize);
	ecl::config globalSize(width, height);
	if(width  % blockSize) globalSize.x += blockSize;
	if(height % blockSize) globalSize.y += blockSize;
	ecl::error err;
	ecl::kernel k("dct", err);
	assert(err == eclSuccess);

	assert(k.setArg(2, width)  == eclSuccess);
	assert(k.setArg(3, height) == eclSuccess);


	for(unsigned i = 0; i < frames; i++) {
		getTime(&s);
		s_dct.in = new (ecl::allocator) float[width * height];
		assert(s_dct.in != NULL);
		s_dct.out = new (ecl::allocator) float[width * height];
		assert(s_dct.out != NULL);
		getTime(&t);
		printTime(&s, &t, "DCT:Malloc: ", "\n");

		getTime(&s);
		__randInit(s_dct.in, width * height);
		getTime(&t);
		printTime(&s, &t, "DCT:Init: ", "\n");

		getTime(&s);
		assert(k.setArg(0, s_dct.out)    == eclSuccess);
		assert(k.setArg(1, s_dct.in)     == eclSuccess);
		assert(k.callNDRange(globalSize, localSize) == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "DCT:Run: ", "\n");

		getTime(&s);
		gmac_sem_wait(&s_quant.free, 1);
		s_quant.next_in = s_dct.out;
		s_quant.next_out = s_dct.in;
		ecl::deviceSendReceive(s_quant.id);
		getTime(&t);
		printTime(&s, &t, "DCT:SendRecv: ", "\n");
	}

	getTime(&s);
	s_dct.in = new (ecl::allocator) float[width * height];
	assert(s_dct.in != NULL);
	s_dct.out = new (ecl::allocator) float[width * height];
	assert(s_dct.out != NULL);
	getTime(&t);
	printTime(&s, &t, "DCT:Malloc: ", "\n");

	getTime(&s);
	gmac_sem_wait(&s_quant.free, 1);
	s_quant.next_in = s_dct.out;
	s_quant.next_out = s_dct.in;
	ecl::deviceSendReceive(s_quant.id);
	getTime(&t);
	printTime(&s, &t, "DCT:SendRecv: ", "\n");

	getTime(&s);
	s_dct.in = new (ecl::allocator) float[width * height];
	assert(s_dct.in != NULL);
	s_dct.out = new (ecl::allocator) float[width * height];
	assert(s_dct.out != NULL);
	getTime(&t);
	printTime(&s, &t, "DCT:Malloc: ", "\n");

	getTime(&s);
	gmac_sem_wait(&s_quant.free, 1);
	s_quant.next_in = s_dct.out;
	s_quant.next_out = s_dct.in;
	ecl::deviceSendReceive(s_quant.id);
	getTime(&t);
	printTime(&s, &t, "DCT:SendRecv: ", "\n");

	return NULL;
}
Пример #8
0
void *idct_thread(void *args)
{
	gmactime_t s, t;

	barrier_wait(&barrierInit);

	getTime(&s);
	gmac_sem_post(&s_idct.free, 1);
	ecl::deviceSendReceive(s_dct.id);
	nextStage(&s_idct, NULL);
	getTime(&t);
	printTime(&s, &t, "IDCT:SendRecv: ", "\n");

	getTime(&s);
	gmac_sem_post(&s_idct.free, 1);
	ecl::deviceSendReceive(s_dct.id);
	getTime(&t);
	nextStage(&s_idct, NULL);
	getTime(&t);
	printTime(&s, &t, "IDCT:SendRecv: ", "\n");

	ecl::config localSize(blockSize, blockSize);
	ecl::config globalSize(width, height);
	if(width  % blockSize) globalSize.x += blockSize;
	if(height % blockSize) globalSize.y += blockSize;
	ecl::error err;
	ecl::kernel k("idct", err);
	assert(err == eclSuccess);

	assert(k.setArg(2, width)   == eclSuccess);
	assert(k.setArg(3, height)  == eclSuccess);

	for(unsigned i = 0; i < frames; i++) {
		getTime(&s);
		assert(k.setArg(0, s_idct.in)  == eclSuccess);
		assert(k.setArg(1, s_idct.out) == eclSuccess);
		assert(k.callNDRange(globalSize, localSize) == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "IDCT:Run: ", "\n");

		getTime(&s);
		assert(ecl::free(s_idct.in) == eclSuccess);
		assert(ecl::free(s_idct.out) == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "IDCT:Free: ", "\n");

		getTime(&s);
		ecl::deviceSendReceive(s_dct.id);
		nextStage(&s_idct, NULL);
		getTime(&t);
		printTime(&s, &t, "IDCT:SendRecv: ", "\n");
	}

	getTime(&s);
	ecl::free(s_idct.in);
	ecl::free(s_idct.out);
	getTime(&t);
	printTime(&s, &t, "IDCT:Free: ", "\n");

	return NULL;
}
// Advances `world` by n time steps of length dt using the OpenCL kernel
// "kernel_xy" from step_world_v3_kernel.cl.
//
// Platform and device are chosen by index through the environment variables
// HPCE_SELECT_PLATFORM and HPCE_SELECT_DEVICE (index 0 when unset). The
// properties buffer is uploaded once. Every step uploads world.state, runs
// the kernel over a w x h range, reads the result back and swaps it into
// world.state; world.t grows by dt per step.
//
// Throws std::runtime_error when no platform or no device is found, and
// rethrows whatever program.build() throws after printing the build logs.
void StepWorldV3OpenCL(world_t &world, float dt, unsigned n)
{
  // ---- Platform ----
  std::vector<cl::Platform> platforms;
  cl::Platform::get(&platforms);
  if (platforms.size() == 0) {
    throw std::runtime_error("No OpenCL platforms found.");
  }

  std::cerr << "Found " << platforms.size() << " platforms\n";
  for (unsigned p = 0; p < platforms.size(); p++) {
    std::string vendor = platforms[p].getInfo<CL_PLATFORM_VENDOR>();
    std::cerr << "  Platform " << p << " : " << vendor << "\n";
  }

  int selectedPlatform = 0;
  const char *platformChoice = getenv("HPCE_SELECT_PLATFORM");
  if (platformChoice) {
    selectedPlatform = atoi(platformChoice);
  }
  std::cerr << "Choosing platform " << selectedPlatform << "\n";
  cl::Platform platform = platforms.at(selectedPlatform);

  // ---- Device ----
  std::vector<cl::Device> devices;
  platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
  if (devices.size() == 0) {
    throw std::runtime_error("No opencl devices found.\n");
  }

  std::cerr << "Found " << devices.size() << " devices\n";
  for (unsigned d = 0; d < devices.size(); d++) {
    std::string name = devices[d].getInfo<CL_DEVICE_NAME>();
    std::cerr << "  Device " << d << " : " << name << "\n";
  }

  int selectedDevice = 0;
  const char *deviceChoice = getenv("HPCE_SELECT_DEVICE");
  if (deviceChoice) {
    selectedDevice = atoi(deviceChoice);
  }
  std::cerr << "Choosing device " << selectedDevice << "\n";
  cl::Device device = devices.at(selectedDevice);

  // ---- Context and program (built for every device of the platform) ----
  cl::Context context(devices);

  std::string kernelSource = LoadSource("step_world_v3_kernel.cl");

  // A vector of (data,length) pairs; the length includes the terminating NUL
  cl::Program::Sources sources;
  sources.push_back(
      std::make_pair(kernelSource.c_str(), kernelSource.size() + 1));

  cl::Program program(context, sources);
  try {
    program.build(devices);
  } catch (...) {
    // Print the build log of every device, then let the caller see the error
    for (unsigned d = 0; d < devices.size(); d++) {
      std::cerr << "Log for device " << devices[d].getInfo<CL_DEVICE_NAME>()
                << ":\n\n";
      std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[d])
                << "\n\n";
    }
    throw;
  }

  // ---- Buffers: 4 bytes per cell ----
  size_t cbBuffer = 4 * world.w * world.h;
  cl::Buffer buffProperties(context, CL_MEM_READ_ONLY, cbBuffer);
  cl::Buffer buffState(context, CL_MEM_READ_ONLY, cbBuffer);
  cl::Buffer buffBuffer(context, CL_MEM_WRITE_ONLY, cbBuffer);

  // ---- Kernel and its arguments ----
  cl::Kernel kernel(program, "kernel_xy");

  float outer = world.alpha * dt;  // Share spread to other cells per step
  float inner = 1 - outer / 4;     // Share that stays in the cell

  kernel.setArg(0, inner);
  kernel.setArg(1, outer);
  kernel.setArg(2, buffState);
  kernel.setArg(3, buffProperties);
  kernel.setArg(4, buffBuffer);

  cl::CommandQueue queue(context, device);

  // The properties are uploaded once, with a blocking write
  queue.enqueueWriteBuffer(buffProperties, CL_TRUE, 0, cbBuffer,
                           &world.properties[0]);

  unsigned w = world.w, h = world.h;
  // Host-side buffer that receives each step's result
  std::vector<float> buffer(w * h);

  cl::NDRange offset(0, 0);               // Iteration starts at x=0, y=0
  cl::NDRange globalSize(w, h);           // One work-item per cell
  cl::NDRange localSize = cl::NullRange;  // Work-group size left unspecified

  for (unsigned step = 0; step < n; step++) {
    // 1. Non-blocking upload of the current state
    cl::Event evCopiedState;
    queue.enqueueWriteBuffer(buffState, CL_FALSE, 0, cbBuffer, &world.state[0],
                             NULL, &evCopiedState);

    // 2. Kernel waits for the upload
    std::vector<cl::Event> kernelDependencies(1, evCopiedState);
    cl::Event evExecutedKernel;
    queue.enqueueNDRangeKernel(kernel, offset, globalSize, localSize,
                               &kernelDependencies, &evExecutedKernel);

    // 3. Blocking read-back waits for the kernel
    std::vector<cl::Event> copyBackDependencies(1, evExecutedKernel);
    queue.enqueueReadBuffer(buffBuffer, CL_TRUE, 0, cbBuffer, &buffer[0],
                            &copyBackDependencies);

    // 4. The result becomes the new state
    std::swap(world.state, buffer);
    world.t += dt;  // The world has moved forwards in time
  }
}
Пример #10
0
/**
 * Vector-add example driven by files: reads the two inputs from VECTORA and
 * VECTORB and the expected result from VECTORC, runs the "vecAdd" kernel,
 * writes the device result to "vectorC_shared" and accumulates the signed
 * difference between expected and computed values.
 *
 * NOTE(review): the results of the file reads are not checked.
 *
 * @return 0 when the accumulated difference is exactly zero, 1 otherwise
 */
int main(int argc, char *argv[])
{
	float *a, *b, *c;
	gmactime_t s, t;
	ecl::error err;
	// Result of calls that must run in every build; they are kept out of
	// assert() so that NDEBUG builds still compile, allocate and launch.
	bool ok = true;
	(void)ok;

	ok = (ecl::compileSource(kernel) == eclSuccess);
	assert(ok);

	// Expected result; the files hold raw floats, so they are opened in binary mode
	float * orig = (float *) malloc(vecSize * sizeof(float));
	assert(orig != NULL);
	std::ifstream o_file(VECTORC, std::ios::in | std::ios::binary);
	o_file.read((char *)orig, vecSize * sizeof(float));
	o_file.close();

	getTime(&s);
	// Alloc & init input data
	ok = (ecl::malloc((void **)&a, vecSize * sizeof(float)) == eclSuccess);
	assert(ok);
	ok = (ecl::malloc((void **)&b, vecSize * sizeof(float)) == eclSuccess);
	assert(ok);
	ok = (ecl::malloc((void **)&c, vecSize * sizeof(float)) == eclSuccess);
	assert(ok);
	getTime(&t);
	printTime(&s, &t, "Alloc: ", "\n");

	std::ifstream a_file(VECTORA, std::ios::in | std::ios::binary);
	std::ifstream b_file(VECTORB, std::ios::in | std::ios::binary);

	getTime(&s);
	a_file.read((char *)a, vecSize * sizeof(float));
	a_file.close();
	b_file.read((char *)b, vecSize * sizeof(float));
	b_file.close();
	getTime(&t);
	printTime(&s, &t, "Init: ", "\n");

	// Call the kernel
	getTime(&s);
	ecl::config localSize (blockSize);
	// Number of work-groups, rounded up, then converted to work-items
	ecl::config globalSize (vecSize / blockSize);
	if(vecSize % blockSize) globalSize.x++;
	globalSize.x *= localSize.x;

	// From here on `kernel` names the kernel object, not the source passed
	// to compileSource above
	ecl::kernel kernel("vecAdd", err);
	assert(err == eclSuccess);
#ifndef __GXX_EXPERIMENTAL_CXX0X__
	err = kernel.setArg(0, c);
	assert(err == eclSuccess);
	err = kernel.setArg(1, a);
	assert(err == eclSuccess);
	err = kernel.setArg(2, b);
	assert(err == eclSuccess);
	err = kernel.setArg(3, vecSize);
	assert(err == eclSuccess);
	err = kernel.callNDRange(globalSize, localSize);
	assert(err == eclSuccess);
#else
	ok = (kernel(c, a, b, vecSize)(globalSize, localSize) == eclSuccess);
	assert(ok);
#endif
	getTime(&t);
	printTime(&s, &t, "Run: ", "\n");

	getTime(&s);
	float error = 0.f;
	for(unsigned i = 0; i < vecSize; i++) {
		error += orig[i] - (c[i]);
	}
	getTime(&t);
	printTime(&s, &t, "Check: ", "\n");

	getTime(&s);
	std::ofstream c_file("vectorC_shared", std::ios::out | std::ios::binary);
	c_file.write((char *)c, vecSize * sizeof(float));
	c_file.close();
	getTime(&t);
	printTime(&s, &t, "Write: ", "\n");

	getTime(&s);
	ecl::free(a);
	ecl::free(b);
	ecl::free(c);
	getTime(&t);
	printTime(&s, &t, "Free: ", "\n");

	free(orig);

	return error != 0;
}
Пример #11
0
int main(int argc, char *argv[]){
 
    std::string line;
    std::string kersource="";
    std::ifstream myfile ("Matmul.cl");
    if (myfile.is_open())
    {
        while ( getline (myfile,line) )
        {
            kersource=kersource+line;
            kersource=kersource+"\n";
        }
        myfile.close();
    }
  
    const char* kernelSource  = kersource.c_str();
    
    unsigned int n = 1000;
 
    // Host input vectors
    float *h_a;
    float *h_b;
    // Host output vector
    float *h_c;
 
    // Device input buffers
    cl::Buffer d_a;
    cl::Buffer d_b;
    // Device output buffer
    cl::Buffer d_c;
    cl::LocalSpaceArg d_bwrk;
 
    // Size, in bytes, of each vector
    size_t bytes = n*n*sizeof (float);
 
    // Allocate memory for each vector on host
    h_a = new float[n*n];
    h_b = new float[n*n];
    h_c = new float[n*n];
 
    // Initialize vectors on host
    for(int i = 0; i < n*n; i++ )
    {
        h_a[i] = 1;
        h_b[i] = 2;
    }
    cl::STRING_CLASS buildlog;
    cl::Program program_;
    std::vector<cl::Device> devices;

    cl_int err = CL_SUCCESS;
    try {
 
        // Query platforms
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        if (platforms.size() == 0) {
            std::cout << "Platform size 0\n";
            return -1;
        }
        

 
        // Get list of devices on default platform and create context
        cl_context_properties properties[] =
           { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
        cl::Context context(CL_DEVICE_TYPE_CPU, properties);
         devices = context.getInfo<CL_CONTEXT_DEVICES>();


 
        // Create command queue for first device
        cl::CommandQueue queue(context, devices[0], 0, &err);
 
        // Create device memory buffers
        d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);
        d_bwrk = cl::Local(n*sizeof(float));
 
        // Bind memory buffers
        queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
        queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);
 
        //Build kernel from source string
        cl::Program::Sources source(1,
            std::make_pair(kernelSource,strlen(kernelSource)));
        program_ = cl::Program(context, source);
        program_.build(devices);

        
        

        std::cout<<"BuildLog: \n"<<buildlog;
        

        // Create kernel object
        cl::Kernel kernel(program_, "multiMat", &err);
 
        // Bind kernel arguments to kernel
        kernel.setArg(0, d_a);
        kernel.setArg(1, d_b);
        kernel.setArg(2, d_c);
        kernel.setArg(3, n);
        kernel.setArg(4,d_bwrk);
 
        // Number of work items in each local work group
        cl::NDRange localSize(64);
        // Number of total work items - localSize must be devisor
        cl::NDRange globalSize((int)(ceil(n/(float)64)*64));
 
        // Enqueue kernel
        cl::Event event;
        queue.enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            globalSize,
            localSize,
            NULL,
            &event);
 
        // Block until kernel completion
        event.wait();
 
        // Read back d_c
        queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
        }
    catch (cl::Error err) 
    {
        std::cerr
        << "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;

        buildlog = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0], NULL);
        std::ofstream logfile ("Matmullog.txt");
        logfile<<buildlog;
        logfile.close();
        
    }
    std::cout<<"Global Size side :"<< (int)(ceil(n/(float)64)*64)<<"\n";
 
    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(int i=0; i<n*n; i++)
        sum += h_c[i];
    std::cout<<"final result: "<<sum<<std::endl;

    //std::ofstream outfile ("MatmulAns.txt");
    for(int i=0;i<n;i++)
    {
        for(int j=0;j<n;j++)
        {
    //        outfile<<h_c[i*n+j]<<" ";
        }
    //    outfile<<"\n";
    }
    //outfile.close();
    // Release host memory
    delete(h_a);
    delete(h_b);
    delete(h_c);
 
    return 0;
}
Пример #12
0
/**
 * Binomial-options example: runs the "binomial_options" kernel for
 * numSamples samples with numSteps + 1 work-items per sample, prints the
 * first numSamples output floats and compares them with the result of
 * binomialOptionCPUReference using a relative L2 norm (tolerance 0.001).
 *
 * @return always 0; the verdict is printed as "Passed!" / "Failed!"
 */
int main(int argc, char *argv[])
{
	gmactime_t s, t, S, T;

	cl_float* randArray = NULL;
	cl_float* output = NULL;
	cl_float* refOutput;
	cl_int numSamples = 64;
	// Result of calls that must run in every build; they are kept out of
	// assert() so that NDEBUG builds still compile and launch the kernel.
	bool ok = true;
	(void)ok;

	getTime(&S);
	getTime(&s);
	ok = (ecl::compileSource(code) == eclSuccess);
	assert(ok);
	setParam<cl_int>(&numSteps, numStepsStr, numStepsDefault);
	// Alloc & init data: four floats (one cl_float4) per sample
	randArray = new (ecl::allocator) cl_float[numSamples * 4];
	output = new (ecl::allocator) cl_float[numSamples * 4];
	assert(randArray != NULL);
	assert(output != NULL);
	refOutput = (float*)malloc(numSamples * sizeof(cl_float4));
	if(refOutput == NULL) {
		// Release what was already allocated before giving up
		ecl::free(randArray);
		ecl::free(output);
		return 0;
	}
	getTime(&t);
	printTime(&s, &t, "Alloc: ", "\n");
	getTime(&s);
	/* random initialisation of input */
	for(int i = 0; i < numSamples * 4; i++) {
		randArray[i] = (float)rand() / (float)RAND_MAX;
	}
	valueInit(output, 0, numSamples * 4);
	getTime(&t);
	printTime(&s, &t, "Init: ", "\n");
	getTime(&s);
	// One work-group of numSteps + 1 work-items per sample
	ecl::config globalSize(numSamples * (numSteps + 1));
	ecl::config localSize(numSteps + 1);

	ecl::error err;
	ecl::kernel kernel("binomial_options", err);
	assert(err == eclSuccess);
#ifndef __GXX_EXPERIMENTAL_CXX0X__
	ok = (kernel.setArg(0, numSteps) == eclSuccess);
	assert(ok);
	ok = (kernel.setArg(1, randArray) == eclSuccess);
	assert(ok);
	ok = (kernel.setArg(2, output) == eclSuccess);
	assert(ok);
	ok = (kernel.setArg(3, (cl_float4 *)NULL) == eclSuccess);
	assert(ok);
	ok = (kernel.setArg(4, (cl_float4 *)NULL) == eclSuccess);
	assert(ok);
	ok = (kernel.callNDRange(globalSize, localSize) == eclSuccess);
	assert(ok);
#else
	ok = (kernel(globalSize, localSize)(numSteps, randArray, output, NULL, NULL) == eclSuccess);
	assert(ok);
#endif
	getTime(&t);
	printTime(&s, &t, "Run: ", "\n");
	printf("Output: ");
	for(int i = 0; i < numSamples; i++) {
		printf("%f ", output[i]);
	}

	getTime(&s);
	bool result = true;
	binomialOptionCPUReference(refOutput, randArray, numSamples, numSteps);
	float error = 0.0f;
	float ref = 0.0f;

	// Relative L2 error over the printed values, element 0 included
	for(int i = 0; i < numSamples; ++i) {
		float diff = output[i] - refOutput[i];
		error += diff * diff;
		ref += output[i] * output[i];
	}

	float normRef =::sqrtf((float) ref);
	// A (near-)zero norm cannot be used as a divisor: treat it as a failure
	if (::fabs((float) ref) < 1e-7f) {
		result = false;
	}
	if(result) {
		float normError = ::sqrtf((float) error);
		error = normError / normRef;
		result = error < 0.001f;
	}
	if(result)
		printf("\nPassed!\n");
	else
		printf("\nFailed!\n");
	getTime(&t);
	printTime(&s, &t, "Check: ", "\n");
	getTime(&T);
	printTime(&S, &T, "Total: ", "\n");
	getTime(&s);
	free(refOutput);
	refOutput = NULL;
	ecl::free(randArray);
	ecl::free(output);
	getTime(&t);
	printTime(&s, &t, "Free: ", "\n");
	return 0;
}
/**
 * Advances `world` by `n` time steps of length `dt` on an OpenCL device,
 * alternating between two device buffers so that no per-step copy is needed.
 *
 * The platform index is read from the environment variable
 * HPCE_SELECT_PLATFORM and the device index from HPCE_SELECT_DEVICE; both
 * default to 0 when the variable is not set. The kernel "kernel_xy" is built
 * from the file "step_world_v3_kernel.cl" (loaded through LoadSource).
 *
 * @param world  World to step. `world.properties` and `world.state` are
 *               uploaded before the first step, `world.state` is overwritten
 *               with the result after the last step, and `world.t` is
 *               advanced by `dt` per step.
 * @param dt     Length of one time step.
 * @param n      Number of steps to perform.
 *
 * @throws std::runtime_error  if no OpenCL platform or no device is found.
 * @throws std::out_of_range   (from std::vector::at) if the selected platform
 *                             or device index does not exist.
 */
void StepWorldV4DoubleBuffered(world_t &world, float dt, unsigned n)
{
    // Get platforms
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    if (platforms.size() == 0)
        throw std::runtime_error("No OpenCL platforms found.");

#ifdef DEBUG
    std::cerr << "Found " << platforms.size() << " platforms\n";
    for (unsigned i = 0; i < platforms.size(); i++)
    {
        // Query the i-th platform (not always the first one) so the listing
        // matches the index printed next to it.
        std::string vendor = platforms[i].getInfo<CL_PLATFORM_VENDOR>();
        std::cerr << "  Platform " << i << " : " << vendor << "\n";
    }
#endif

    // Select platform: environment override, otherwise the first one
    int selectedPlatform = 0;
    const char *platformEnv = getenv("HPCE_SELECT_PLATFORM");
    if (platformEnv)
    {
        selectedPlatform = atoi(platformEnv);
    }

#ifdef DEBUG
    std::cerr << "Choosing platform " << selectedPlatform << "\n";
#endif

    cl::Platform platform = platforms.at(selectedPlatform);

    // Get devices
    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
    if (devices.size() == 0)
    {
        throw std::runtime_error("No opencl devices found.\n");
    }

#ifdef DEBUG
    std::cerr << "Found " << devices.size() << " devices\n";
    for (unsigned i = 0; i < devices.size(); i++)
    {
        std::string name = devices[i].getInfo<CL_DEVICE_NAME>();
        std::cerr << "  Device " << i << " : " << name << "\n";
    }
#endif

    // Select device: environment override, otherwise the first one
    int selectedDevice = 0;
    const char *deviceEnv = getenv("HPCE_SELECT_DEVICE");
    if (deviceEnv)
    {
        selectedDevice = atoi(deviceEnv);
    }
#ifdef DEBUG
    std::cerr << "Choosing device " << selectedDevice << "\n";
#endif

    cl::Device device = devices.at(selectedDevice);

    // Create a context spanning every device of the chosen platform
    cl::Context context(devices);

    // Load kernel to string
    std::string kernelSource = LoadSource("step_world_v3_kernel.cl");

    // Load kernel to sources
    cl::Program::Sources sources;   // A vector of (data,length) pairs
    sources.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size() + 1)); // push on our single string

    // Create program from context
    cl::Program program(context, sources);

#ifdef DEBUG
    // In debug builds, dump the build log of every device before re-throwing
    try
    {
        program.build(devices);
    }
    catch (...)
    {
        for (unsigned i = 0; i < devices.size(); i++)
        {
            std::cerr << "Log for device " << devices[i].getInfo<CL_DEVICE_NAME>() << ":\n\n";
            std::cerr << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[i]) << "\n\n";
        }
        throw;
    }
#else
    program.build(devices);
#endif

    // Every buffer holds w*h elements of 4 bytes each
    size_t cbBuffer = 4 * world.w * world.h;
    cl::Buffer buffProperties(context, CL_MEM_READ_ONLY, cbBuffer);
    cl::Buffer buffState(context, CL_MEM_READ_WRITE, cbBuffer);
    cl::Buffer buffBuffer(context, CL_MEM_READ_WRITE, cbBuffer);

    cl::Kernel kernel(program, "kernel_xy");

    unsigned w = world.w, h = world.h;

    float outer = world.alpha * dt; // We spread alpha to other cells per time
    float inner = 1 - outer / 4;        // Anything that doesn't spread stays

    // Arguments that stay the same for every step
    kernel.setArg(0, inner);
    kernel.setArg(1, outer);

    kernel.setArg(3, buffProperties);


    cl::CommandQueue queue(context, device);

    // Blocking uploads of the initial properties and state
    queue.enqueueWriteBuffer(buffProperties, CL_TRUE, 0, cbBuffer, &world.properties[0]);
    queue.enqueueWriteBuffer(buffState, CL_TRUE, 0, cbBuffer, &world.state[0]);

    cl::NDRange offset(0, 0);               // Always start iterations at x=0, y=0
    cl::NDRange globalSize(w, h);   // Global size must match the original loops
    cl::NDRange localSize = cl::NullRange;  // We don't care about local size

    for (unsigned t = 0; t < n; t++)
    {
        // Arguments 2 and 4 are rebound each step because the two buffer
        // handles trade places at the end of the previous step.
        kernel.setArg(2, buffState);
        kernel.setArg(4, buffBuffer);

        queue.enqueueNDRangeKernel(kernel, offset, globalSize, localSize);
        queue.enqueueBarrier();

        // Swap the two buffer handles instead of copying buffBuffer back
        // into buffState: after the swap buffState names the newest data.
        std::swap(buffState, buffBuffer);

        world.t += dt; // We have moved the world forwards in time

    } // end of for(t...

    // Blocking read of the final state back into host memory
    queue.enqueueReadBuffer(buffState, CL_TRUE, 0, cbBuffer, &world.state[0]);
}
Пример #14
0
/**
 * Reports the size of this resource.
 *
 * @return the value that globalSize() yields for _h
 */
unsigned int RMRes::size() {
    const unsigned int resourceSize = globalSize(_h);
    return resourceSize;
}
Пример #15
0
/**
 * Sets up the OpenCL objects needed to run the erode/dilate kernels.
 *
 * The platform index is read from the environment variable
 * HPCE_SELECT_PLATFORM (default 0). The program is built from the file named
 * by `source` (loaded through LoadSource); on a build failure the build log
 * of every device is written to std::cerr and the exception is re-thrown.
 *
 * @param levels        2*|levels| + 1 device buffers are allocated.
 * @param w             Used together with `bits` to size each buffer
 *                      ((w*bits)/2 bytes) and the global work range
 *                      ((w*bits)/32 by 1).
 * @param h             Not used by this function.
 * @param bits          Selects the kernel pair "erode_kernel_<bits>" /
 *                      "dilate_kernel_<bits>"; 1, 2, 4, 8, 16 and 32 are
 *                      recognised. Any other value leaves both names empty,
 *                      and kernel creation is then attempted with an empty
 *                      name.
 * @param source        Path of the kernel source file.
 * @param deviceNumber  Index of the device to use; -1 selects device 0.
 *
 * @return Tuple of (erode kernel, dilate kernel, device buffers, command
 *         queue, offset range, global range, local range). The buffers are
 *         allocated with `new` and are not freed here.
 *         NOTE(review): presumably the caller is expected to delete them —
 *         confirm against the call sites.
 *
 * @throws std::runtime_error  if no OpenCL platform or no device is found.
 * @throws std::out_of_range   (from std::vector::at) if the selected platform
 *                             or device index does not exist.
 */
std::tr1::tuple<cl::Kernel,cl::Kernel,std::vector<cl::Buffer*>,cl::CommandQueue,cl::NDRange,cl::NDRange,cl::NDRange> init_cl(int levels, unsigned w, unsigned h, unsigned bits, std::string source, int deviceNumber)
{
    std::vector<cl::Platform> platforms;

    cl::Platform::get(&platforms);

    if(platforms.size()==0) throw std::runtime_error("No OpenCL platforms found.");

    std::cerr<<"Found "<<platforms.size()<<" platforms\n";
    for(unsigned i=0;i<platforms.size();i++){
        // Query the i-th platform (not always the first one) so the listing
        // matches the index printed next to it.
        std::string vendor=platforms[i].getInfo<CL_PLATFORM_VENDOR>();
        std::cerr<<"  Platform "<<i<<" : "<<vendor<<"\n";
    }

    // Platform: environment override, otherwise the first one
    int selectedPlatform=0;
    const char *platformEnv=getenv("HPCE_SELECT_PLATFORM");
    if(platformEnv){
        selectedPlatform=atoi(platformEnv);
    }
    std::cerr<<"Choosing platform "<<selectedPlatform<<"\n";
    cl::Platform platform=platforms.at(selectedPlatform);

    std::vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
    if(devices.size()==0){
        throw std::runtime_error("No opencl devices found.\n");
    }

    std::cerr<<"Found "<<devices.size()<<" devices\n";
    for(unsigned i=0;i<devices.size();i++){
        std::string name=devices[i].getInfo<CL_DEVICE_NAME>();
        std::cerr<<"  Device "<<i<<" : "<<name<<"\n";
    }

    // Device: explicit argument, otherwise the first one
    int selectedDevice=0;

    if (deviceNumber != -1) selectedDevice = deviceNumber;

    std::cerr<<"Choosing device "<<selectedDevice<<"\n";
    cl::Device device=devices.at(selectedDevice);

    cl::Context context(devices);

    std::string kernelSource=LoadSource(source.c_str());

    cl::Program::Sources sources;	// A vector of (data,length) pairs
    sources.push_back(std::make_pair(kernelSource.c_str(), kernelSource.size()+1));	// push on our single string

    cl::Program program(context, sources);
    try{
        program.build(devices);
    }catch(...){
        for(unsigned i=0;i<devices.size();i++){
            std::cerr<<"Log for device "<<devices[i].getInfo<CL_DEVICE_NAME>()<<":\n\n";
            std::cerr<<program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[i])<<"\n\n";
        }
        throw;
    }

    std::string erodeKernelName;
    std::string dilateKernelName;

    switch (bits) {
        case 1:
            erodeKernelName = "erode_kernel_1";
            dilateKernelName = "dilate_kernel_1";
            break;
        case 2:
            erodeKernelName = "erode_kernel_2";
            dilateKernelName = "dilate_kernel_2";
            break;
        case 4:
            erodeKernelName = "erode_kernel_4";
            dilateKernelName = "dilate_kernel_4";
            break;
        case 8:
            erodeKernelName = "erode_kernel_8";
            dilateKernelName = "dilate_kernel_8";
            break;
        case 16:
            erodeKernelName = "erode_kernel_16";
            dilateKernelName = "dilate_kernel_16";
            break;
        case 32:
            erodeKernelName = "erode_kernel_32";
            dilateKernelName = "dilate_kernel_32";
            break;
        default:
            // Unrecognised bit depth: both names stay empty.
            break;
    }

    // Create the kernels and the queue before any raw-pointer buffer exists,
    // so a failure here cannot leak device buffers.
    cl::Kernel erodeKernel(program, erodeKernelName.c_str());
    cl::Kernel dilateKernel(program, dilateKernelName.c_str());

    cl::CommandQueue queue(context, device);

    size_t cbBuffer= (w*bits)/2;

    // Two buffers per level ... and one for luck.
    const int bufferCount = 2*abs(levels) + 1;

    std::vector<cl::Buffer*> gpuBuffers;
    gpuBuffers.reserve(bufferCount);   // no reallocation while pushing below

    try{
        for (int i=0; i<bufferCount; i++)
        {
            gpuBuffers.push_back(new cl::Buffer(context, CL_MEM_READ_WRITE, cbBuffer));
        }
    }catch(...){
        // Release the buffers that were already created before re-throwing.
        for (size_t i=0; i<gpuBuffers.size(); i++)
        {
            delete gpuBuffers[i];
        }
        throw;
    }

    cl::NDRange offset(0, 0);				 // Always start iterations at x=0, y=0
    cl::NDRange globalSize((w*bits)/32, 1);  // Global size must match the original loops
    cl::NDRange localSize=cl::NullRange;	 // We don't care about local size

    return std::tr1::make_tuple(erodeKernel,dilateKernel,gpuBuffers,queue,offset,globalSize,localSize);
}