Esempio n. 1
0
    /**
     * Write one scalar per process into a domain dataset and optionally
     * attach a single attribute to that dataset.
     *
     * The dataset extent in every simulated dimension is taken from
     * getGpuNodes() and this process writes a single element at the
     * index given by getPosition().
     *
     * @param params    provides the data collector and the current step
     * @param name      name of the dataset to write
     * @param value     scalar contributed by this process
     * @param attrName  attribute name; when empty no attribute is written
     * @param attribute attribute value written under attrName
     */
    void operator()(ThreadParams& params,
            const std::string& name, T_Scalar value,
            const std::string& attrName = "", T_Attribute attribute = T_Attribute())
    {
        log<picLog::INPUT_OUTPUT>("HDF5: write %1%D scalars: %2%") % simDim % name;

        // every process contributes exactly one element
        Dimensions perProcessExtent(1, 1, 1);
        // extent of the dataset over all processes
        Dimensions datasetExtent(1, 1, 1);
        // position of this process' element inside the dataset
        Dimensions writeOffset(0, 0, 0);
        // origin of the global domain
        Dimensions domainOrigin(0, 0, 0);

        for (uint32_t dim = 0; dim < simDim; ++dim)
        {
            datasetExtent[dim] = Environment<simDim>::get().GridController().getGpuNodes()[dim];
            writeOffset[dim] = Environment<simDim>::get().GridController().getPosition()[dim];
        }

        // wait for the pending transaction event before issuing the collective write
        __getTransactionEvent().waitForFinished();

        typename traits::PICToSplash<T_Scalar>::type scalarType;
        params.dataCollector->writeDomain(
            params.currentStep,                  /* id == time step */
            datasetExtent,                       /* total size of dataset over all processes */
            writeOffset,                         /* write offset for this process */
            scalarType,                          /* data type */
            simDim,                              /* NDims spatial dimensionality of the field */
            splash::Selection(perProcessExtent), /* data size of this process */
            name.c_str(),                        /* data set name */
            splash::Domain(domainOrigin,         /* offset of the global domain */
                           datasetExtent),       /* size of the global domain */
            DomainCollector::GridType,
            &value);

        // nothing more to do when no attribute was requested
        if (attrName.empty())
            return;

        /* simulation attribute for data */
        typename traits::PICToSplash<T_Attribute>::type attributeType;

        log<picLog::INPUT_OUTPUT>("HDF5: write attribute %1% for scalars: %2%") % attrName % name;
        params.dataCollector->writeAttribute(params.currentStep,
                                             attributeType, name.c_str(),
                                             attrName.c_str(), &attribute);
    }
Esempio n. 2
0
/**
 * Thread entry point of the quantization pipeline stage.
 *
 * After the start barrier it posts its "free" semaphore, advances the
 * pipeline once, then for every frame runs the "quant" kernel on
 * s_quant.in / s_quant.out and advances the pipeline again. One extra
 * nextStage() call is made after the last frame.
 *
 * Every ecl call is executed outside of assert() so that the work is still
 * performed when the file is compiled with NDEBUG.
 *
 * @param args unused
 * @return always NULL
 */
void *quant_thread(void *args)
{
	gmactime_t s, t;

	barrier_wait(&barrierInit);

	getTime(&s);
	gmac_sem_post(&s_quant.free, 1);
	nextStage(&s_quant, &s_idct);
	getTime(&t);
	printTime(&s, &t, "Quant:SendRecv: ", "\n");

	ecl::config localSize(blockSize, blockSize);
	ecl::config globalSize(width, height);
	// NOTE(review): this pads by a whole block instead of rounding up to the
	// next multiple of blockSize - confirm that is what callNDRange expects.
	if(width  % blockSize) globalSize.x += blockSize;
	if(height % blockSize) globalSize.y += blockSize;
	ecl::error err;
	ecl::kernel k("quant", err);
	assert(err == eclSuccess);

	// Frame-independent kernel arguments
	err = k.setArg(2, width);
	assert(err == eclSuccess);
	err = k.setArg(3, height);
	assert(err == eclSuccess);
	err = k.setArg(4, float(1e-6));
	assert(err == eclSuccess);

	for(unsigned i = 0; i < frames; i++) {
		getTime(&s);
		err = k.setArg(0, s_quant.in);
		assert(err == eclSuccess);
		err = k.setArg(1, s_quant.out);
		assert(err == eclSuccess);
		err = k.callNDRange(globalSize, localSize);
		assert(err == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "Quant:Run: ", "\n");

		getTime(&s);
		nextStage(&s_quant, &s_idct);
		getTime(&t);
		printTime(&s, &t, "Quant:SendRecv: ", "\n");
	}

	// Advance the pipeline one more stage after the last frame
	getTime(&s);
	nextStage(&s_quant, &s_idct);
	getTime(&t);
	printTime(&s, &t, "Quant:SendRecv: ", "\n");

	return NULL;
}
Esempio n. 3
0
void *addVector(void *ptr)
{
	float *a, *b;
	float **c = (float **)ptr;
	gmactime_t s, t;
	ecl::error ret;
	getTime(&s);
	// Alloc & init input data
	ret = ecl::malloc((void **)&a, vecSize * sizeof(float));
	assert(ret == eclSuccess);
	ret = ecl::malloc((void **)&b, vecSize * sizeof(float));
	assert(ret == eclSuccess);

	for(unsigned i = 0; i < vecSize; i++) {
		a[i] = 1.f * rand() / RAND_MAX;
		b[i] = 1.f * rand() / RAND_MAX;
	}

	// Alloc output data
	ret = ecl::malloc((void **)c, vecSize * sizeof(float));
	assert(ret == eclSuccess);
	getTime(&t);
	printTime(&s, &t, "Alloc: ", "\n");

	// Call the kernel
	getTime(&s);
	ecl::config localSize(blockSize);
	ecl::config globalSize(vecSize / blockSize);
	if(vecSize % blockSize) globalSize.x++;
	globalSize.x *= localSize.x;

	ecl::kernel kernel("vecAdd", ret);
	assert(ret == eclSuccess);
	ret = kernel.setArg(0, *c);
	assert(ret == eclSuccess);
	ret = kernel.setArg(1, a);
	assert(ret == eclSuccess);
	ret = kernel.setArg(2, b);
	assert(ret == eclSuccess);
	ret = kernel.setArg(3, vecSize);
	assert(ret == eclSuccess);

	ret = kernel.callNDRange(globalSize, localSize);
	assert(ret == eclSuccess);

	getTime(&t);
	printTime(&s, &t, "Run: ", "\n");

	getTime(&s);
	float error = 0;
	for(unsigned i = 0; i < vecSize; i++) {
		error += (*c)[i] - (a[i] + b[i]);
	}
	getTime(&t);
	printTime(&s, &t, "Check: ", "\n");
	fprintf(stdout, "Error: %.02f\n", error);

	ecl::free(a);
	ecl::free(b);
	ecl::free(*c);

	return NULL;
}
int main(int argc, char *argv[])
{
    cl_uint samples = 256 * 256 * 4;
    size_t blockSizeX = 1;
    size_t blockSizeY = 1;
    cl_float *randArray = NULL;
    cl_float *deviceCallPrice = NULL;
    cl_float *devicePutPrice = NULL;
    cl_float *hostCallPrice = NULL;
    cl_float *hostPutPrice = NULL;
	ecl::error ret;

    cl_uint height = 64;

    /* Calculate width and height from samples */
    samples = samples / 4;
    samples = (samples / GROUP_SIZE)? (samples / GROUP_SIZE) * GROUP_SIZE: GROUP_SIZE;

    cl_uint tempVar1 = (cl_uint)sqrt((double)samples);
    tempVar1 = (tempVar1 / GROUP_SIZE)? (tempVar1 / GROUP_SIZE) * GROUP_SIZE: GROUP_SIZE;
    samples = tempVar1 * tempVar1;

    width = tempVar1;
    height = width;
    ret = ecl::compileSource(code);
	assert(ret == eclSuccess);
    setParam<cl_uint>(&width, widthStr, widthDefault);

    // Alloc & init input data
    randArray = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)];
    deviceCallPrice = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)];
    devicePutPrice = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)];
    assert(randArray != NULL);
    assert(deviceCallPrice != NULL);
    assert(devicePutPrice != NULL);
    hostCallPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
    if(hostCallPrice == NULL)
        return 0;
    hostPutPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
    if(hostPutPrice == NULL) {
        free(hostCallPrice);
        return 0;
    }

    // random initialisation of input
    for(cl_uint i = 0; i < width * height * 4; i++)
        randArray[i] = (float)rand() / (float)RAND_MAX;

    eclMemset(deviceCallPrice, 0, width * height * sizeof(cl_float4));
    eclMemset(devicePutPrice, 0, width * height * sizeof(cl_float4));
    eclMemset(hostCallPrice, 0, width * height * sizeof(cl_float4));
    eclMemset(hostPutPrice, 0, width * height * sizeof(cl_float4));

    // Call the kernel
    ecl::config globalSize(width, height);
    ecl::config localSize(blockSizeX, blockSizeY);
	ecl::config globalWorkOffset(0); 
    ecl::kernel kernel("blackScholes", ret);
    assert(ret == eclSuccess);
#ifndef __GXX_EXPERIMENTAL_CXX0X__
    ret = kernel.setArg(0, randArray);
	assert(ret == eclSuccess);
    ret = kernel.setArg(1, width);
	assert(ret == eclSuccess);
    ret = kernel.setArg(2, deviceCallPrice);
	assert(ret == eclSuccess);
    ret = kernel.setArg(3, devicePutPrice);
	assert(ret == eclSuccess);
    ret = kernel.callNDRange(globalSize, localSize, globalWorkOffset);
	assert(ret == eclSuccess);
#else
    ret = kernel(globalSize, localSize)(randArray, width, deviceCallPrice, devicePutPrice);
	assert(ret == eclSuccess);
#endif

    printf("deviceCallPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", deviceCallPrice[i]);
    }
    printf("\ndevicePutPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", devicePutPrice[i]);
    }

    blackScholesCPU(randArray, width, height, hostCallPrice, hostPutPrice);
    printf("\nhostCallPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", hostCallPrice[i]);
    }
    printf("\nhostPutPrice£º\n");
    for(cl_uint i = 0; i < width; i++) {
        printf("%f ", hostPutPrice[i]);
    }

    float error = 0.0f;
    float ref = 0.0f;
    bool callPriceResult = true;
    bool putPriceResult = true;
    float normRef;

    for(cl_uint i = 1; i < width * height * 4; ++i) {
        float diff = hostCallPrice[i] - deviceCallPrice[i];
        error += diff * diff;
        ref += hostCallPrice[i] * deviceCallPrice[i];
    }

    normRef =::sqrtf((float) ref);
    if (::fabs((float) ref) < 1e-7f) {
        callPriceResult = false;
    }
    if(callPriceResult) {
        float normError = ::sqrtf((float) error);
        error = normError / normRef;
        callPriceResult = error < 1e-6f;
    }

    for(cl_uint i = 1; i < width * height * 4; ++i) {
        float diff = hostPutPrice[i] - devicePutPrice[i];
        error += diff * diff;
        ref += hostPutPrice[i] * devicePutPrice[i];
    }

    normRef =::sqrtf((float) ref);
    if (::fabs((float) ref) < 1e-7f) {
        putPriceResult = false;
    }
    if(putPriceResult) {
        float normError = ::sqrtf((float) error);
        error = normError / normRef;
        putPriceResult = error < 1e-4f;
    }

    if(!(callPriceResult ? (putPriceResult ? true : false) : false)) {
        printf("Failed!\n");
    } else {
        printf("Passed!\n");
    }

    free(hostPutPrice);
    hostPutPrice = NULL;
    free(hostCallPrice);
    hostCallPrice = NULL;

    ecl::free(devicePutPrice);
    ecl::free(deviceCallPrice);
    ecl::free(randArray);

    return 0;
}
Esempio n. 5
0
int memcpyTest(MemcpyType type, bool callKernel, void *(*memcpy_fn)(void *, const void *, size_t n))
{
	int error = 0;

	ecl::config globalSize (1);
	ecl::config localSize (1);

	ecl::error ret;
	ecl::kernel kernel("null", ret);
	assert(ret == eclSuccess);

	uint8_t *baseSrc = NULL;
	uint8_t *eclSrc = NULL;
	uint8_t *eclDst = NULL;

	baseSrc = (uint8_t *)malloc(maxCount);
	init(baseSrc, int(maxCount), 0xca);
	for (size_t count = minCount; count <= maxCount; count *= 2) {
		fprintf(stderr, "ALLOC: "FMT_SIZE"\n", count);

		if (type == GMAC_TO_GMAC) {
			assert(ecl::malloc((void **)&eclSrc, count) == eclSuccess);
			assert(ecl::malloc((void **)&eclDst, count) == eclSuccess);
		} else if (type == HOST_TO_GMAC) {
			eclSrc = (uint8_t *)malloc(count);
			assert(ecl::malloc((void **)&eclDst, count) == eclSuccess);
		} else if (type == GMAC_TO_HOST) {
			assert(ecl::malloc((void **)&eclSrc, count) == eclSuccess);
			eclDst = (uint8_t *)malloc(count);
		}

		for (size_t stride = 0, i = 1; stride < count/3; stride = i, i =  i * 2 - (i == 1? 0: 1)) {
			for (size_t copyCount = 1; copyCount < count/3; copyCount *= 2) {
				init(eclSrc + stride, int(copyCount), 0xca);
				if (stride == 0) {
					init(eclDst + stride, int(copyCount) + 1, 0);
				} else {
					init(eclDst + stride - 1, int(copyCount) + 2, 0);
				}
				assert(stride + copyCount <= count);

				if (callKernel) {
					ret = kernel.callNDRange(globalSize, localSize);
					assert(ret == eclSuccess);
				}
				memcpy_fn(eclDst + stride, eclSrc + stride, copyCount);

				int ret = memcmp(eclDst + stride, baseSrc + stride, copyCount);
				if (stride == 0) {
					ret = ret && (eclDst[stride - 1] == 0 && eclDst[stride + copyCount] == 0);
				} else {
					ret = ret && (eclDst[stride - 1] == 0 && eclDst[stride + copyCount] == 0);
				}

				if (ret != 0) {
#if 0
					fprintf(stderr, "Error: eclToGmacTest size: %zd, stride: %zd, copy: %zd\n",
						count    ,
						stride   ,
						copyCount);
#endif
					abort();
					error = 1;
					goto exit_test;
				}
#if 0
				for (unsigned k = 0; k < count; k++) {
					int ret = baseDst[k] != eclDst[k];
					if (ret != 0) {
						fprintf(stderr, "Error: eclToGmacTest size: %zd, stride: %zd, copy: %zd. Pos %u\n", count    ,
							stride   ,
							copyCount, k);
						error = 1;
					}
				}
#endif
			}
		}

		if (type == GMAC_TO_GMAC) {
			assert(ecl::free(eclSrc) == eclSuccess);
			assert(ecl::free(eclDst) == eclSuccess);
		} else if (type == HOST_TO_GMAC) {
			free(eclSrc);
			assert(ecl::free(eclDst) == eclSuccess);
		} else if (type == GMAC_TO_HOST) {
			assert(ecl::free(eclSrc) == eclSuccess);
			free(eclDst);
		}
	}
	free(baseSrc);

	return error;

exit_test:
	if (type == GMAC_TO_GMAC) {
		assert(ecl::free(eclSrc) == eclSuccess);
		assert(ecl::free(eclDst) == eclSuccess);
	} else if (type == HOST_TO_GMAC) {
		free(eclSrc);
		assert(ecl::free(eclDst) == eclSuccess);
	} else if (type == GMAC_TO_HOST) {
		assert(ecl::free(eclSrc) == eclSuccess);
		free(eclDst);
	}

	free(baseSrc);

	return error;
}
Esempio n. 6
0
/**
 * Thread entry point of the DCT pipeline stage.
 *
 * For every frame it allocates a fresh input/output buffer pair, fills the
 * input via __randInit, runs the "dct" kernel and hands the pair over to the
 * quantization stage (waiting on s_quant.free first). After the last frame
 * two additional freshly allocated pairs are handed over without running
 * the kernel.
 *
 * All ecl calls are executed outside of assert() so that they are still
 * performed when the file is compiled with NDEBUG.
 *
 * @param args unused
 * @return always NULL
 */
void *dct_thread(void *args)
{
	gmactime_t s, t;

	barrier_wait(&barrierInit);

	ecl::config localSize(blockSize, blockSize);
	ecl::config globalSize(width, height);
	// NOTE(review): this pads by a whole block instead of rounding up to the
	// next multiple of blockSize - confirm that is what callNDRange expects.
	if(width  % blockSize) globalSize.x += blockSize;
	if(height % blockSize) globalSize.y += blockSize;
	ecl::error err;
	ecl::kernel k("dct", err);
	assert(err == eclSuccess);

	// Frame-independent kernel arguments
	err = k.setArg(2, width);
	assert(err == eclSuccess);
	err = k.setArg(3, height);
	assert(err == eclSuccess);

	for(unsigned i = 0; i < frames; i++) {
		getTime(&s);
		s_dct.in = new (ecl::allocator) float[width * height];
		assert(s_dct.in != NULL);
		s_dct.out = new (ecl::allocator) float[width * height];
		assert(s_dct.out != NULL);
		getTime(&t);
		printTime(&s, &t, "DCT:Malloc: ", "\n");

		getTime(&s);
		__randInit(s_dct.in, width * height);
		getTime(&t);
		printTime(&s, &t, "DCT:Init: ", "\n");

		getTime(&s);
		err = k.setArg(0, s_dct.out);
		assert(err == eclSuccess);
		err = k.setArg(1, s_dct.in);
		assert(err == eclSuccess);
		err = k.callNDRange(globalSize, localSize);
		assert(err == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "DCT:Run: ", "\n");

		// Hand the buffers to the next stage: our output is its input
		getTime(&s);
		gmac_sem_wait(&s_quant.free, 1);
		s_quant.next_in = s_dct.out;
		s_quant.next_out = s_dct.in;
		ecl::deviceSendReceive(s_quant.id);
		getTime(&t);
		printTime(&s, &t, "DCT:SendRecv: ", "\n");
	}

	// Two trailing passes: allocate a pair and hand it over without
	// running the kernel.
	for(unsigned pass = 0; pass < 2; pass++) {
		getTime(&s);
		s_dct.in = new (ecl::allocator) float[width * height];
		assert(s_dct.in != NULL);
		s_dct.out = new (ecl::allocator) float[width * height];
		assert(s_dct.out != NULL);
		getTime(&t);
		printTime(&s, &t, "DCT:Malloc: ", "\n");

		getTime(&s);
		gmac_sem_wait(&s_quant.free, 1);
		s_quant.next_in = s_dct.out;
		s_quant.next_out = s_dct.in;
		ecl::deviceSendReceive(s_quant.id);
		getTime(&t);
		printTime(&s, &t, "DCT:SendRecv: ", "\n");
	}

	return NULL;
}
Esempio n. 7
0
/**
 * Thread entry point of the inverse-DCT pipeline stage.
 *
 * After the start barrier it performs two priming exchanges (post
 * s_idct.free, deviceSendReceive with the DCT stage, nextStage). Then, for
 * every frame, it runs the "idct" kernel on s_idct.in / s_idct.out, frees
 * both buffers and performs another exchange. The buffers that are current
 * after the last exchange are freed at the end.
 *
 * All ecl calls are executed outside of assert() so that kernels are still
 * launched and buffers still released when compiled with NDEBUG.
 *
 * @param args unused
 * @return always NULL
 */
void *idct_thread(void *args)
{
	gmactime_t s, t;

	barrier_wait(&barrierInit);

	// Two priming exchanges before the first frame is processed
	for(unsigned pass = 0; pass < 2; pass++) {
		getTime(&s);
		gmac_sem_post(&s_idct.free, 1);
		ecl::deviceSendReceive(s_dct.id);
		nextStage(&s_idct, NULL);
		getTime(&t);
		printTime(&s, &t, "IDCT:SendRecv: ", "\n");
	}

	ecl::config localSize(blockSize, blockSize);
	ecl::config globalSize(width, height);
	// NOTE(review): this pads by a whole block instead of rounding up to the
	// next multiple of blockSize - confirm that is what callNDRange expects.
	if(width  % blockSize) globalSize.x += blockSize;
	if(height % blockSize) globalSize.y += blockSize;
	ecl::error err;
	ecl::kernel k("idct", err);
	assert(err == eclSuccess);

	// Frame-independent kernel arguments
	err = k.setArg(2, width);
	assert(err == eclSuccess);
	err = k.setArg(3, height);
	assert(err == eclSuccess);

	for(unsigned i = 0; i < frames; i++) {
		getTime(&s);
		err = k.setArg(0, s_idct.in);
		assert(err == eclSuccess);
		err = k.setArg(1, s_idct.out);
		assert(err == eclSuccess);
		err = k.callNDRange(globalSize, localSize);
		assert(err == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "IDCT:Run: ", "\n");

		getTime(&s);
		err = ecl::free(s_idct.in);
		assert(err == eclSuccess);
		err = ecl::free(s_idct.out);
		assert(err == eclSuccess);
		getTime(&t);
		printTime(&s, &t, "IDCT:Free: ", "\n");

		getTime(&s);
		ecl::deviceSendReceive(s_dct.id);
		nextStage(&s_idct, NULL);
		getTime(&t);
		printTime(&s, &t, "IDCT:SendRecv: ", "\n");
	}

	// Release the buffers left over from the final exchange; the result is
	// intentionally not asserted, as in the original code.
	getTime(&s);
	ecl::free(s_idct.in);
	ecl::free(s_idct.out);
	getTime(&t);
	printTime(&s, &t, "IDCT:Free: ", "\n");

	return NULL;
}
Esempio n. 8
0
/**
 * Vector-addition test driver: reads two input vectors and a reference
 * result from files, runs the "vecAdd" kernel, compares the output with the
 * reference and writes the output to "vectorC_shared".
 *
 * All ecl calls are executed outside of assert() so that they are still
 * performed when compiled with NDEBUG.
 *
 * @return 0 when the accumulated signed difference to the reference is
 *         exactly zero, 1 otherwise (also 1 if the reference buffer cannot
 *         be allocated)
 */
int main(int argc, char *argv[])
{
	float *a, *b, *c;
	gmactime_t s, t;
	ecl::error err;

	err = ecl::compileSource(kernel);
	assert(err == eclSuccess);

	// Reference result, read from file
	float * orig = (float *) malloc(vecSize * sizeof(float));
	if(orig == NULL)
		return 1;
	std::ifstream o_file(VECTORC);
	o_file.read((char *)orig, vecSize * sizeof(float));
	o_file.close();

	getTime(&s);
	// Alloc & init input data
	err = ecl::malloc((void **)&a, vecSize * sizeof(float));
	assert(err == eclSuccess);
	err = ecl::malloc((void **)&b, vecSize * sizeof(float));
	assert(err == eclSuccess);
	err = ecl::malloc((void **)&c, vecSize * sizeof(float));
	assert(err == eclSuccess);
	getTime(&t);
	printTime(&s, &t, "Alloc: ", "\n");

	std::ifstream a_file(VECTORA);
	std::ifstream b_file(VECTORB);

	getTime(&s);
	a_file.read((char *)a, vecSize * sizeof(float));
	a_file.close();
	b_file.read((char *)b, vecSize * sizeof(float));
	b_file.close();
	getTime(&t);
	printTime(&s, &t, "Init: ", "\n");

	// Call the kernel
	getTime(&s);
	ecl::config localSize (blockSize);
	ecl::config globalSize (vecSize / blockSize);
	// one extra group for the remainder, then convert groups to work-items
	if(vecSize % blockSize) globalSize.x++;
	globalSize.x *= localSize.x;

	ecl::kernel kernel("vecAdd", err);
	assert(err == eclSuccess);
#ifndef __GXX_EXPERIMENTAL_CXX0X__
	err = kernel.setArg(0, c);
	assert(err == eclSuccess);
	err = kernel.setArg(1, a);
	assert(err == eclSuccess);
	err = kernel.setArg(2, b);
	assert(err == eclSuccess);
	err = kernel.setArg(3, vecSize);
	assert(err == eclSuccess);
	err = kernel.callNDRange(globalSize, localSize);
	assert(err == eclSuccess);
#else
	err = kernel(c, a, b, vecSize)(globalSize, localSize);
	assert(err == eclSuccess);
#endif
	getTime(&t);
	printTime(&s, &t, "Run: ", "\n");

	// Accumulate the signed difference between reference and result
	getTime(&s);
	float error = 0.f;
	for(unsigned i = 0; i < vecSize; i++) {
		error += orig[i] - (c[i]);
	}
	getTime(&t);
	printTime(&s, &t, "Check: ", "\n");

	getTime(&s);
	std::ofstream c_file("vectorC_shared");
	c_file.write((char *)c, vecSize * sizeof(float));
	c_file.close();
	getTime(&t);
	printTime(&s, &t, "Write: ", "\n");

	getTime(&s);
	ecl::free(a);
	ecl::free(b);
	ecl::free(c);
	free(orig);
	getTime(&t);
	printTime(&s, &t, "Free: ", "\n");

	return error != 0;
}
Esempio n. 9
0
int main(int argc, char *argv[]){
 
    std::string line;
    std::string kersource="";
    std::ifstream myfile ("Matmul.cl");
    if (myfile.is_open())
    {
        while ( getline (myfile,line) )
        {
            kersource=kersource+line;
            kersource=kersource+"\n";
        }
        myfile.close();
    }
  
    const char* kernelSource  = kersource.c_str();
    
    unsigned int n = 1000;
 
    // Host input vectors
    float *h_a;
    float *h_b;
    // Host output vector
    float *h_c;
 
    // Device input buffers
    cl::Buffer d_a;
    cl::Buffer d_b;
    // Device output buffer
    cl::Buffer d_c;
    cl::LocalSpaceArg d_bwrk;
 
    // Size, in bytes, of each vector
    size_t bytes = n*n*sizeof (float);
 
    // Allocate memory for each vector on host
    h_a = new float[n*n];
    h_b = new float[n*n];
    h_c = new float[n*n];
 
    // Initialize vectors on host
    for(int i = 0; i < n*n; i++ )
    {
        h_a[i] = 1;
        h_b[i] = 2;
    }
    cl::STRING_CLASS buildlog;
    cl::Program program_;
    std::vector<cl::Device> devices;

    cl_int err = CL_SUCCESS;
    try {
 
        // Query platforms
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
        if (platforms.size() == 0) {
            std::cout << "Platform size 0\n";
            return -1;
        }
        

 
        // Get list of devices on default platform and create context
        cl_context_properties properties[] =
           { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
        cl::Context context(CL_DEVICE_TYPE_CPU, properties);
         devices = context.getInfo<CL_CONTEXT_DEVICES>();


 
        // Create command queue for first device
        cl::CommandQueue queue(context, devices[0], 0, &err);
 
        // Create device memory buffers
        d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes);
        d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes);
        d_bwrk = cl::Local(n*sizeof(float));
 
        // Bind memory buffers
        queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a);
        queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b);
 
        //Build kernel from source string
        cl::Program::Sources source(1,
            std::make_pair(kernelSource,strlen(kernelSource)));
        program_ = cl::Program(context, source);
        program_.build(devices);

        
        

        std::cout<<"BuildLog: \n"<<buildlog;
        

        // Create kernel object
        cl::Kernel kernel(program_, "multiMat", &err);
 
        // Bind kernel arguments to kernel
        kernel.setArg(0, d_a);
        kernel.setArg(1, d_b);
        kernel.setArg(2, d_c);
        kernel.setArg(3, n);
        kernel.setArg(4,d_bwrk);
 
        // Number of work items in each local work group
        cl::NDRange localSize(64);
        // Number of total work items - localSize must be devisor
        cl::NDRange globalSize((int)(ceil(n/(float)64)*64));
 
        // Enqueue kernel
        cl::Event event;
        queue.enqueueNDRangeKernel(
            kernel,
            cl::NullRange,
            globalSize,
            localSize,
            NULL,
            &event);
 
        // Block until kernel completion
        event.wait();
 
        // Read back d_c
        queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c);
        }
    catch (cl::Error err) 
    {
        std::cerr
        << "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl;

        buildlog = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0], NULL);
        std::ofstream logfile ("Matmullog.txt");
        logfile<<buildlog;
        logfile.close();
        
    }
    std::cout<<"Global Size side :"<< (int)(ceil(n/(float)64)*64)<<"\n";
 
    // Sum up vector c and print result divided by n, this should equal 1 within error
    float sum = 0;
    for(int i=0; i<n*n; i++)
        sum += h_c[i];
    std::cout<<"final result: "<<sum<<std::endl;

    //std::ofstream outfile ("MatmulAns.txt");
    for(int i=0;i<n;i++)
    {
        for(int j=0;j<n;j++)
        {
    //        outfile<<h_c[i*n+j]<<" ";
        }
    //    outfile<<"\n";
    }
    //outfile.close();
    // Release host memory
    delete(h_a);
    delete(h_b);
    delete(h_c);
 
    return 0;
}
/**
 * Builds the linear system A, b from all factors, eliminates the first m
 * rows/columns with a Schur complement and stores a square-root
 * factorization of the reduced system in linearized_jacobians and
 * linearized_residuals.
 *
 * Side effects: overwrites the offsets in parameter_block_idx, sets m and n,
 * and spawns NUM_THREADS pthreads running ThreadsConstructA.
 */
void MarginalizationInfo::marginalize()
{
    // Blocks already present in parameter_block_idx come first: give each a
    // consecutive offset, advancing by localSize() of its stored size.
    int pos = 0;
    for (auto &it : parameter_block_idx)
    {
        it.second = pos;
        pos += localSize(parameter_block_size[it.first]);
    }

    // m = total dimension of the leading group (the part eliminated below)
    m = pos;

    // Every remaining block known to parameter_block_size is appended after it.
    for (const auto &it : parameter_block_size)
    {
        if (parameter_block_idx.find(it.first) == parameter_block_idx.end())
        {
            parameter_block_idx[it.first] = pos;
            pos += localSize(it.second);
        }
    }

    // n = total dimension of the trailing (retained) group
    n = pos - m;

    //ROS_DEBUG("marginalization, pos: %d, m: %d, n: %d, size: %d", pos, m, n, (int)parameter_block_idx.size());

    // Accumulators for the full (m + n)-dimensional system
    TicToc t_summing;
    Eigen::MatrixXd A(pos, pos);
    Eigen::VectorXd b(pos);
    A.setZero();
    b.setZero();
    /*
    for (auto it : factors)
    {
        for (int i = 0; i < static_cast<int>(it->parameter_blocks.size()); i++)
        {
            int idx_i = parameter_block_idx[reinterpret_cast<long>(it->parameter_blocks[i])];
            int size_i = localSize(parameter_block_size[reinterpret_cast<long>(it->parameter_blocks[i])]);
            Eigen::MatrixXd jacobian_i = it->jacobians[i].leftCols(size_i);
            for (int j = i; j < static_cast<int>(it->parameter_blocks.size()); j++)
            {
                int idx_j = parameter_block_idx[reinterpret_cast<long>(it->parameter_blocks[j])];
                int size_j = localSize(parameter_block_size[reinterpret_cast<long>(it->parameter_blocks[j])]);
                Eigen::MatrixXd jacobian_j = it->jacobians[j].leftCols(size_j);
                if (i == j)
                    A.block(idx_i, idx_j, size_i, size_j) += jacobian_i.transpose() * jacobian_j;
                else
                {
                    A.block(idx_i, idx_j, size_i, size_j) += jacobian_i.transpose() * jacobian_j;
                    A.block(idx_j, idx_i, size_j, size_i) = A.block(idx_i, idx_j, size_i, size_j).transpose();
                }
            }
            b.segment(idx_i, size_i) += jacobian_i.transpose() * it->residuals;
        }
    }
    ROS_INFO("summing up costs %f ms", t_summing.toc());
    */
    //multi thread


    // Distribute the factors round-robin over NUM_THREADS work items.
    TicToc t_thread_summing;
    pthread_t tids[NUM_THREADS];
    ThreadsStruct threadsstruct[NUM_THREADS];
    int i = 0;
    for (auto it : factors)
    {
        threadsstruct[i].sub_factors.push_back(it);
        i++;
        i = i % NUM_THREADS;
    }
    // Each thread gets its own zeroed A/b and its own copy of both maps,
    // then runs ThreadsConstructA on its share of the factors.
    for (int i = 0; i < NUM_THREADS; i++)
    {
        TicToc zero_matrix;
        threadsstruct[i].A = Eigen::MatrixXd::Zero(pos,pos);
        threadsstruct[i].b = Eigen::VectorXd::Zero(pos);
        threadsstruct[i].parameter_block_size = parameter_block_size;
        threadsstruct[i].parameter_block_idx = parameter_block_idx;
        int ret = pthread_create( &tids[i], NULL, ThreadsConstructA ,(void*)&(threadsstruct[i]));
        if (ret != 0)
        {
            ROS_WARN("pthread_create error");
            ROS_BREAK();
        }
    }
    // Join in reverse creation order and sum the per-thread contributions.
    for( int i = NUM_THREADS - 1; i >= 0; i--)  
    {
        pthread_join( tids[i], NULL ); 
        A += threadsstruct[i].A;
        b += threadsstruct[i].b;
    }
    //ROS_DEBUG("thread summing up costs %f ms", t_thread_summing.toc());
    //ROS_INFO("A diff %f , b diff %f ", (A - tmp_A).sum(), (b - tmp_b).sum());


    //TODO
    // Symmetrize the leading m x m block before the eigen-decomposition.
    Eigen::MatrixXd Amm = 0.5 * (A.block(0, 0, m, m) + A.block(0, 0, m, m).transpose());
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> saes(Amm);

    //ROS_ASSERT_MSG(saes.eigenvalues().minCoeff() >= -1e-4, "min eigenvalue %f", saes.eigenvalues().minCoeff());

    // Pseudo-inverse of Amm: eigenvalues above eps are inverted, all others
    // are replaced by 0 (eps is defined outside this function).
    Eigen::MatrixXd Amm_inv = saes.eigenvectors() * Eigen::VectorXd((saes.eigenvalues().array() > eps).select(saes.eigenvalues().array().inverse(), 0)).asDiagonal() * saes.eigenvectors().transpose();
    //printf("error1: %f\n", (Amm * Amm_inv - Eigen::MatrixXd::Identity(m, m)).sum());

    // Schur complement: eliminate the leading m dimensions so that A and b
    // describe only the trailing n dimensions.
    Eigen::VectorXd bmm = b.segment(0, m);
    Eigen::MatrixXd Amr = A.block(0, m, m, n);
    Eigen::MatrixXd Arm = A.block(m, 0, n, m);
    Eigen::MatrixXd Arr = A.block(m, m, n, n);
    Eigen::VectorXd brr = b.segment(m, n);
    A = Arr - Arm * Amm_inv * Amr;
    b = brr - Arm * Amm_inv * bmm;

    // Decompose the reduced A; S keeps eigenvalues above eps (others 0) and
    // S_inv holds their reciprocals (others 0).
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> saes2(A);
    Eigen::VectorXd S = Eigen::VectorXd((saes2.eigenvalues().array() > eps).select(saes2.eigenvalues().array(), 0));
    Eigen::VectorXd S_inv = Eigen::VectorXd((saes2.eigenvalues().array() > eps).select(saes2.eigenvalues().array().inverse(), 0));

    Eigen::VectorXd S_sqrt = S.cwiseSqrt();
    Eigen::VectorXd S_inv_sqrt = S_inv.cwiseSqrt();

    // Square-root form: J = sqrt(S) * V^T and r = sqrt(S^-1) * V^T * b, so
    // that J^T * J reproduces the thresholded A and J^T * r the matching b.
    linearized_jacobians = S_sqrt.asDiagonal() * saes2.eigenvectors().transpose();
    linearized_residuals = S_inv_sqrt.asDiagonal() * saes2.eigenvectors().transpose() * b;
    //std::cout << A << std::endl
    //          << std::endl;
    //std::cout << linearized_jacobians << std::endl;
    //printf("error2: %f %f\n", (linearized_jacobians.transpose() * linearized_jacobians - A).sum(),
    //      (linearized_jacobians.transpose() * linearized_residuals - b).sum());
}
Esempio n. 11
0
/**
 * Binomial-option test driver: runs the "binomial_options" kernel on random
 * input, computes the reference on the host and compares both with a
 * relative L2 error (tolerance 0.001).
 *
 * All ecl calls are executed outside of assert() so that they are still
 * performed when compiled with NDEBUG.
 *
 * @return always 0 (the verdict is printed as "Passed!" / "Failed!")
 */
int main(int argc, char *argv[])
{
	gmactime_t s, t, S, T;

	cl_float* randArray = NULL;
	cl_float* output = NULL;
	cl_float* refOutput;
	cl_int numSamples = 64;
	ecl::error err;

	getTime(&S);
	getTime(&s);
	err = ecl::compileSource(code);
	assert(err == eclSuccess);
	setParam<cl_int>(&numSteps, numStepsStr, numStepsDefault);

	// Alloc & init data. Each sample is one cl_float4, i.e. four floats; the
	// array length is given in floats (not bytes, which over-allocated by 4x).
	randArray = new (ecl::allocator) cl_float[numSamples * 4];
	output = new (ecl::allocator) cl_float[numSamples * 4];
	assert(randArray != NULL);
	assert(output != NULL);
	refOutput = (float*)malloc(numSamples * sizeof(cl_float4));
	if(refOutput == NULL) {
		// Release the device-visible buffers instead of leaking them.
		ecl::free(randArray);
		ecl::free(output);
		return 0;
	}
	getTime(&t);
	printTime(&s, &t, "Alloc: ", "\n");

	getTime(&s);
	/* random initialisation of input */
	for(int i = 0; i < numSamples * 4; i++) {
		randArray[i] = (float)rand() / (float)RAND_MAX;
	}
	valueInit(output, 0, numSamples * 4);
	getTime(&t);
	printTime(&s, &t, "Init: ", "\n");

	getTime(&s);
	// One work-group of (numSteps + 1) work-items per sample
	ecl::config globalSize(numSamples * (numSteps + 1));
	ecl::config localSize(numSteps + 1);

	ecl::kernel kernel("binomial_options", err);
	assert(err == eclSuccess);
#ifndef __GXX_EXPERIMENTAL_CXX0X__
	err = kernel.setArg(0, numSteps);
	assert(err == eclSuccess);
	err = kernel.setArg(1, randArray);
	assert(err == eclSuccess);
	err = kernel.setArg(2, output);
	assert(err == eclSuccess);
	err = kernel.setArg(3, (cl_float4 *)NULL);
	assert(err == eclSuccess);
	err = kernel.setArg(4, (cl_float4 *)NULL);
	assert(err == eclSuccess);
	err = kernel.callNDRange(globalSize, localSize);
	assert(err == eclSuccess);
#else
	err = kernel(globalSize, localSize)(numSteps, randArray, output, NULL, NULL);
	assert(err == eclSuccess);
#endif
	getTime(&t);
	printTime(&s, &t, "Run: ", "\n");
	printf("Output: ");
	for(int i = 0; i < numSamples; i++) {
		printf("%f ", output[i]);
	}

	getTime(&s);
	bool result = true;
	binomialOptionCPUReference(refOutput, randArray, numSamples, numSteps);
	float error = 0.0f;
	float ref = 0.0f;

	// NOTE(review): element 0 is skipped, as in the original - confirm intent.
	for(int i = 1; i < numSamples; ++i) {
		float diff = output[i] - refOutput[i];
		error += diff * diff;
		ref += output[i] * output[i];
	}

	float normRef =::sqrtf((float) ref);
	// A (near-)zero reference norm makes the relative error meaningless
	if (::fabs((float) ref) < 1e-7f) {
		result = false;
	}
	if(result) {
		float normError = ::sqrtf((float) error);
		error = normError / normRef;
		result = error < 0.001f;
	}
	if(result)
		printf("\nPassed!\n");
	else
		printf("\nFailed!\n");
	getTime(&t);
	printTime(&s, &t, "Check: ", "\n");
	getTime(&T);
	printTime(&S, &T, "Total: ", "\n");

	getTime(&s);
	free(refOutput);
	refOutput = NULL;
	ecl::free(randArray);
	ecl::free(output);
	getTime(&t);
	printTime(&s, &t, "Free: ", "\n");
	return 0;
}