void operator()(ThreadParams& params, const std::string& name, T_Scalar value, const std::string& attrName = "", T_Attribute attribute = T_Attribute()) { log<picLog::INPUT_OUTPUT>("HDF5: write %1%D scalars: %2%") % simDim % name; // Size over all processes Dimensions globalSize(1, 1, 1); // Offset for this process Dimensions localOffset(0, 0, 0); // Offset for all processes Dimensions globalOffset(0, 0, 0); for (uint32_t d = 0; d < simDim; ++d) { globalSize[d] = Environment<simDim>::get().GridController().getGpuNodes()[d]; localOffset[d] = Environment<simDim>::get().GridController().getPosition()[d]; } Dimensions localSize(1, 1, 1); // avoid deadlock between not finished pmacc tasks and mpi calls in adios __getTransactionEvent().waitForFinished(); typename traits::PICToSplash<T_Scalar>::type splashType; params.dataCollector->writeDomain(params.currentStep, /* id == time step */ globalSize, /* total size of dataset over all processes */ localOffset, /* write offset for this process */ splashType, /* data type */ simDim, /* NDims spatial dimensionality of the field */ splash::Selection(localSize), /* data size of this process */ name.c_str(), /* data set name */ splash::Domain( globalOffset, /* offset of the global domain */ globalSize /* size of the global domain */ ), DomainCollector::GridType, &value); if(!attrName.empty()) { /*simulation attribute for data*/ typename traits::PICToSplash<T_Attribute>::type attType; log<picLog::INPUT_OUTPUT>("HDF5: write attribute %1% for scalars: %2%") % attrName % name; params.dataCollector->writeAttribute(params.currentStep, attType, name.c_str(), attrName.c_str(), &attribute); } }
void *quant_thread(void *args) { gmactime_t s, t; barrier_wait(&barrierInit); getTime(&s); gmac_sem_post(&s_quant.free, 1); nextStage(&s_quant, &s_idct); getTime(&t); printTime(&s, &t, "Quant:SendRecv: ", "\n"); ecl::config localSize(blockSize, blockSize); ecl::config globalSize(width, height); if(width % blockSize) globalSize.x += blockSize; if(height % blockSize) globalSize.y += blockSize; ecl::error err; ecl::kernel k("quant", err); assert(err == eclSuccess); assert(k.setArg(2, width) == eclSuccess); assert(k.setArg(3, height) == eclSuccess); assert(k.setArg(4, float(1e-6)) == eclSuccess); for(unsigned i = 0; i < frames; i++) { getTime(&s); assert(k.setArg(0, s_quant.in) == eclSuccess); assert(k.setArg(1, s_quant.out) == eclSuccess); assert(k.callNDRange(globalSize, localSize) == eclSuccess); getTime(&t); printTime(&s, &t, "Quant:Run: ", "\n"); getTime(&s); nextStage(&s_quant, &s_idct); getTime(&t); printTime(&s, &t, "Quant:SendRecv: ", "\n"); } // Move one stage the pipeline stages the pipeline getTime(&s); nextStage(&s_quant, &s_idct); getTime(&t); printTime(&s, &t, "Quant:SendRecv: ", "\n"); return NULL; }
void *addVector(void *ptr) { float *a, *b; float **c = (float **)ptr; gmactime_t s, t; ecl::error ret; getTime(&s); // Alloc & init input data ret = ecl::malloc((void **)&a, vecSize * sizeof(float)); assert(ret == eclSuccess); ret = ecl::malloc((void **)&b, vecSize * sizeof(float)); assert(ret == eclSuccess); for(unsigned i = 0; i < vecSize; i++) { a[i] = 1.f * rand() / RAND_MAX; b[i] = 1.f * rand() / RAND_MAX; } // Alloc output data ret = ecl::malloc((void **)c, vecSize * sizeof(float)); assert(ret == eclSuccess); getTime(&t); printTime(&s, &t, "Alloc: ", "\n"); // Call the kernel getTime(&s); ecl::config localSize(blockSize); ecl::config globalSize(vecSize / blockSize); if(vecSize % blockSize) globalSize.x++; globalSize.x *= localSize.x; ecl::kernel kernel("vecAdd", ret); assert(ret == eclSuccess); ret = kernel.setArg(0, *c); assert(ret == eclSuccess); ret = kernel.setArg(1, a); assert(ret == eclSuccess); ret = kernel.setArg(2, b); assert(ret == eclSuccess); ret = kernel.setArg(3, vecSize); assert(ret == eclSuccess); ret = kernel.callNDRange(globalSize, localSize); assert(ret == eclSuccess); getTime(&t); printTime(&s, &t, "Run: ", "\n"); getTime(&s); float error = 0; for(unsigned i = 0; i < vecSize; i++) { error += (*c)[i] - (a[i] + b[i]); } getTime(&t); printTime(&s, &t, "Check: ", "\n"); fprintf(stdout, "Error: %.02f\n", error); ecl::free(a); ecl::free(b); ecl::free(*c); return NULL; }
int main(int argc, char *argv[]) { cl_uint samples = 256 * 256 * 4; size_t blockSizeX = 1; size_t blockSizeY = 1; cl_float *randArray = NULL; cl_float *deviceCallPrice = NULL; cl_float *devicePutPrice = NULL; cl_float *hostCallPrice = NULL; cl_float *hostPutPrice = NULL; ecl::error ret; cl_uint height = 64; /* Calculate width and height from samples */ samples = samples / 4; samples = (samples / GROUP_SIZE)? (samples / GROUP_SIZE) * GROUP_SIZE: GROUP_SIZE; cl_uint tempVar1 = (cl_uint)sqrt((double)samples); tempVar1 = (tempVar1 / GROUP_SIZE)? (tempVar1 / GROUP_SIZE) * GROUP_SIZE: GROUP_SIZE; samples = tempVar1 * tempVar1; width = tempVar1; height = width; ret = ecl::compileSource(code); assert(ret == eclSuccess); setParam<cl_uint>(&width, widthStr, widthDefault); // Alloc & init input data randArray = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)]; deviceCallPrice = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)]; devicePutPrice = new (ecl::allocator) cl_float[width * height * sizeof(cl_float4)]; assert(randArray != NULL); assert(deviceCallPrice != NULL); assert(devicePutPrice != NULL); hostCallPrice = (cl_float*)malloc(width * height * sizeof(cl_float4)); if(hostCallPrice == NULL) return 0; hostPutPrice = (cl_float*)malloc(width * height * sizeof(cl_float4)); if(hostPutPrice == NULL) { free(hostCallPrice); return 0; } // random initialisation of input for(cl_uint i = 0; i < width * height * 4; i++) randArray[i] = (float)rand() / (float)RAND_MAX; eclMemset(deviceCallPrice, 0, width * height * sizeof(cl_float4)); eclMemset(devicePutPrice, 0, width * height * sizeof(cl_float4)); eclMemset(hostCallPrice, 0, width * height * sizeof(cl_float4)); eclMemset(hostPutPrice, 0, width * height * sizeof(cl_float4)); // Call the kernel ecl::config globalSize(width, height); ecl::config localSize(blockSizeX, blockSizeY); ecl::config globalWorkOffset(0); ecl::kernel kernel("blackScholes", ret); assert(ret == eclSuccess); #ifndef 
__GXX_EXPERIMENTAL_CXX0X__ ret = kernel.setArg(0, randArray); assert(ret == eclSuccess); ret = kernel.setArg(1, width); assert(ret == eclSuccess); ret = kernel.setArg(2, deviceCallPrice); assert(ret == eclSuccess); ret = kernel.setArg(3, devicePutPrice); assert(ret == eclSuccess); ret = kernel.callNDRange(globalSize, localSize, globalWorkOffset); assert(ret == eclSuccess); #else ret = kernel(globalSize, localSize)(randArray, width, deviceCallPrice, devicePutPrice); assert(ret == eclSuccess); #endif printf("deviceCallPrice£º\n"); for(cl_uint i = 0; i < width; i++) { printf("%f ", deviceCallPrice[i]); } printf("\ndevicePutPrice£º\n"); for(cl_uint i = 0; i < width; i++) { printf("%f ", devicePutPrice[i]); } blackScholesCPU(randArray, width, height, hostCallPrice, hostPutPrice); printf("\nhostCallPrice£º\n"); for(cl_uint i = 0; i < width; i++) { printf("%f ", hostCallPrice[i]); } printf("\nhostPutPrice£º\n"); for(cl_uint i = 0; i < width; i++) { printf("%f ", hostPutPrice[i]); } float error = 0.0f; float ref = 0.0f; bool callPriceResult = true; bool putPriceResult = true; float normRef; for(cl_uint i = 1; i < width * height * 4; ++i) { float diff = hostCallPrice[i] - deviceCallPrice[i]; error += diff * diff; ref += hostCallPrice[i] * deviceCallPrice[i]; } normRef =::sqrtf((float) ref); if (::fabs((float) ref) < 1e-7f) { callPriceResult = false; } if(callPriceResult) { float normError = ::sqrtf((float) error); error = normError / normRef; callPriceResult = error < 1e-6f; } for(cl_uint i = 1; i < width * height * 4; ++i) { float diff = hostPutPrice[i] - devicePutPrice[i]; error += diff * diff; ref += hostPutPrice[i] * devicePutPrice[i]; } normRef =::sqrtf((float) ref); if (::fabs((float) ref) < 1e-7f) { putPriceResult = false; } if(putPriceResult) { float normError = ::sqrtf((float) error); error = normError / normRef; putPriceResult = error < 1e-4f; } if(!(callPriceResult ? (putPriceResult ? 
true : false) : false)) { printf("Failed!\n"); } else { printf("Passed!\n"); } free(hostPutPrice); hostPutPrice = NULL; free(hostCallPrice); hostCallPrice = NULL; ecl::free(devicePutPrice); ecl::free(deviceCallPrice); ecl::free(randArray); return 0; }
int memcpyTest(MemcpyType type, bool callKernel, void *(*memcpy_fn)(void *, const void *, size_t n)) { int error = 0; ecl::config globalSize (1); ecl::config localSize (1); ecl::error ret; ecl::kernel kernel("null", ret); assert(ret == eclSuccess); uint8_t *baseSrc = NULL; uint8_t *eclSrc = NULL; uint8_t *eclDst = NULL; baseSrc = (uint8_t *)malloc(maxCount); init(baseSrc, int(maxCount), 0xca); for (size_t count = minCount; count <= maxCount; count *= 2) { fprintf(stderr, "ALLOC: "FMT_SIZE"\n", count); if (type == GMAC_TO_GMAC) { assert(ecl::malloc((void **)&eclSrc, count) == eclSuccess); assert(ecl::malloc((void **)&eclDst, count) == eclSuccess); } else if (type == HOST_TO_GMAC) { eclSrc = (uint8_t *)malloc(count); assert(ecl::malloc((void **)&eclDst, count) == eclSuccess); } else if (type == GMAC_TO_HOST) { assert(ecl::malloc((void **)&eclSrc, count) == eclSuccess); eclDst = (uint8_t *)malloc(count); } for (size_t stride = 0, i = 1; stride < count/3; stride = i, i = i * 2 - (i == 1? 0: 1)) { for (size_t copyCount = 1; copyCount < count/3; copyCount *= 2) { init(eclSrc + stride, int(copyCount), 0xca); if (stride == 0) { init(eclDst + stride, int(copyCount) + 1, 0); } else { init(eclDst + stride - 1, int(copyCount) + 2, 0); } assert(stride + copyCount <= count); if (callKernel) { ret = kernel.callNDRange(globalSize, localSize); assert(ret == eclSuccess); } memcpy_fn(eclDst + stride, eclSrc + stride, copyCount); int ret = memcmp(eclDst + stride, baseSrc + stride, copyCount); if (stride == 0) { ret = ret && (eclDst[stride - 1] == 0 && eclDst[stride + copyCount] == 0); } else { ret = ret && (eclDst[stride - 1] == 0 && eclDst[stride + copyCount] == 0); } if (ret != 0) { #if 0 fprintf(stderr, "Error: eclToGmacTest size: %zd, stride: %zd, copy: %zd\n", count , stride , copyCount); #endif abort(); error = 1; goto exit_test; } #if 0 for (unsigned k = 0; k < count; k++) { int ret = baseDst[k] != eclDst[k]; if (ret != 0) { fprintf(stderr, "Error: eclToGmacTest size: %zd, 
stride: %zd, copy: %zd. Pos %u\n", count , stride , copyCount, k); error = 1; } } #endif } } if (type == GMAC_TO_GMAC) { assert(ecl::free(eclSrc) == eclSuccess); assert(ecl::free(eclDst) == eclSuccess); } else if (type == HOST_TO_GMAC) { free(eclSrc); assert(ecl::free(eclDst) == eclSuccess); } else if (type == GMAC_TO_HOST) { assert(ecl::free(eclSrc) == eclSuccess); free(eclDst); } } free(baseSrc); return error; exit_test: if (type == GMAC_TO_GMAC) { assert(ecl::free(eclSrc) == eclSuccess); assert(ecl::free(eclDst) == eclSuccess); } else if (type == HOST_TO_GMAC) { free(eclSrc); assert(ecl::free(eclDst) == eclSuccess); } else if (type == GMAC_TO_HOST) { assert(ecl::free(eclSrc) == eclSuccess); free(eclDst); } free(baseSrc); return error; }
void *dct_thread(void *args) { gmactime_t s, t; barrier_wait(&barrierInit); ecl::config localSize(blockSize, blockSize); ecl::config globalSize(width, height); if(width % blockSize) globalSize.x += blockSize; if(height % blockSize) globalSize.y += blockSize; ecl::error err; ecl::kernel k("dct", err); assert(err == eclSuccess); assert(k.setArg(2, width) == eclSuccess); assert(k.setArg(3, height) == eclSuccess); for(unsigned i = 0; i < frames; i++) { getTime(&s); s_dct.in = new (ecl::allocator) float[width * height]; assert(s_dct.in != NULL); s_dct.out = new (ecl::allocator) float[width * height]; assert(s_dct.out != NULL); getTime(&t); printTime(&s, &t, "DCT:Malloc: ", "\n"); getTime(&s); __randInit(s_dct.in, width * height); getTime(&t); printTime(&s, &t, "DCT:Init: ", "\n"); getTime(&s); assert(k.setArg(0, s_dct.out) == eclSuccess); assert(k.setArg(1, s_dct.in) == eclSuccess); assert(k.callNDRange(globalSize, localSize) == eclSuccess); getTime(&t); printTime(&s, &t, "DCT:Run: ", "\n"); getTime(&s); gmac_sem_wait(&s_quant.free, 1); s_quant.next_in = s_dct.out; s_quant.next_out = s_dct.in; ecl::deviceSendReceive(s_quant.id); getTime(&t); printTime(&s, &t, "DCT:SendRecv: ", "\n"); } getTime(&s); s_dct.in = new (ecl::allocator) float[width * height]; assert(s_dct.in != NULL); s_dct.out = new (ecl::allocator) float[width * height]; assert(s_dct.out != NULL); getTime(&t); printTime(&s, &t, "DCT:Malloc: ", "\n"); getTime(&s); gmac_sem_wait(&s_quant.free, 1); s_quant.next_in = s_dct.out; s_quant.next_out = s_dct.in; ecl::deviceSendReceive(s_quant.id); getTime(&t); printTime(&s, &t, "DCT:SendRecv: ", "\n"); getTime(&s); s_dct.in = new (ecl::allocator) float[width * height]; assert(s_dct.in != NULL); s_dct.out = new (ecl::allocator) float[width * height]; assert(s_dct.out != NULL); getTime(&t); printTime(&s, &t, "DCT:Malloc: ", "\n"); getTime(&s); gmac_sem_wait(&s_quant.free, 1); s_quant.next_in = s_dct.out; s_quant.next_out = s_dct.in; ecl::deviceSendReceive(s_quant.id); 
getTime(&t); printTime(&s, &t, "DCT:SendRecv: ", "\n"); return NULL; }
void *idct_thread(void *args) { gmactime_t s, t; barrier_wait(&barrierInit); getTime(&s); gmac_sem_post(&s_idct.free, 1); ecl::deviceSendReceive(s_dct.id); nextStage(&s_idct, NULL); getTime(&t); printTime(&s, &t, "IDCT:SendRecv: ", "\n"); getTime(&s); gmac_sem_post(&s_idct.free, 1); ecl::deviceSendReceive(s_dct.id); getTime(&t); nextStage(&s_idct, NULL); getTime(&t); printTime(&s, &t, "IDCT:SendRecv: ", "\n"); ecl::config localSize(blockSize, blockSize); ecl::config globalSize(width, height); if(width % blockSize) globalSize.x += blockSize; if(height % blockSize) globalSize.y += blockSize; ecl::error err; ecl::kernel k("idct", err); assert(err == eclSuccess); assert(k.setArg(2, width) == eclSuccess); assert(k.setArg(3, height) == eclSuccess); for(unsigned i = 0; i < frames; i++) { getTime(&s); assert(k.setArg(0, s_idct.in) == eclSuccess); assert(k.setArg(1, s_idct.out) == eclSuccess); assert(k.callNDRange(globalSize, localSize) == eclSuccess); getTime(&t); printTime(&s, &t, "IDCT:Run: ", "\n"); getTime(&s); assert(ecl::free(s_idct.in) == eclSuccess); assert(ecl::free(s_idct.out) == eclSuccess); getTime(&t); printTime(&s, &t, "IDCT:Free: ", "\n"); getTime(&s); ecl::deviceSendReceive(s_dct.id); nextStage(&s_idct, NULL); getTime(&t); printTime(&s, &t, "IDCT:SendRecv: ", "\n"); } getTime(&s); ecl::free(s_idct.in); ecl::free(s_idct.out); getTime(&t); printTime(&s, &t, "IDCT:Free: ", "\n"); return NULL; }
int main(int argc, char *argv[]) { float *a, *b, *c; gmactime_t s, t; ecl::error err; assert(ecl::compileSource(kernel) == eclSuccess); float * orig = (float *) malloc(vecSize * sizeof(float)); std::ifstream o_file(VECTORC); o_file.read((char *)orig, vecSize * sizeof(float)); o_file.close(); getTime(&s); // Alloc & init input data assert(ecl::malloc((void **)&a, vecSize * sizeof(float)) == eclSuccess); assert(ecl::malloc((void **)&b, vecSize * sizeof(float)) == eclSuccess); assert(ecl::malloc((void **)&c, vecSize * sizeof(float)) == eclSuccess); getTime(&t); printTime(&s, &t, "Alloc: ", "\n"); std::ifstream a_file(VECTORA); std::ifstream b_file(VECTORB); getTime(&s); a_file.read((char *)a, vecSize * sizeof(float)); a_file.close(); b_file.read((char *)b, vecSize * sizeof(float)); b_file.close(); getTime(&t); printTime(&s, &t, "Init: ", "\n"); // Call the kernel getTime(&s); ecl::config localSize (blockSize); ecl::config globalSize (vecSize / blockSize); if(vecSize % blockSize) globalSize.x++; globalSize.x *= localSize.x; ecl::kernel kernel("vecAdd", err); assert(err == eclSuccess); #ifndef __GXX_EXPERIMENTAL_CXX0X__ err = kernel.setArg(0, c); assert(err == eclSuccess); err = kernel.setArg(1, a); assert(err == eclSuccess); err = kernel.setArg(2, b); assert(err == eclSuccess); err = kernel.setArg(3, vecSize); assert(err == eclSuccess); err = kernel.callNDRange(globalSize, localSize); assert(err == eclSuccess); #else assert(kernel(c, a, b, vecSize)(globalSize, localSize) == eclSuccess); #endif getTime(&t); printTime(&s, &t, "Run: ", "\n"); getTime(&s); float error = 0.f; for(unsigned i = 0; i < vecSize; i++) { error += orig[i] - (c[i]); } getTime(&t); printTime(&s, &t, "Check: ", "\n"); getTime(&s); std::ofstream c_file("vectorC_shared"); c_file.write((char *)c, vecSize * sizeof(float)); c_file.close(); getTime(&t); printTime(&s, &t, "Write: ", "\n"); getTime(&s); ecl::free(a); ecl::free(b); ecl::free(c); getTime(&t); printTime(&s, &t, "Free: ", "\n"); return error != 
0; }
int main(int argc, char *argv[]){ std::string line; std::string kersource=""; std::ifstream myfile ("Matmul.cl"); if (myfile.is_open()) { while ( getline (myfile,line) ) { kersource=kersource+line; kersource=kersource+"\n"; } myfile.close(); } const char* kernelSource = kersource.c_str(); unsigned int n = 1000; // Host input vectors float *h_a; float *h_b; // Host output vector float *h_c; // Device input buffers cl::Buffer d_a; cl::Buffer d_b; // Device output buffer cl::Buffer d_c; cl::LocalSpaceArg d_bwrk; // Size, in bytes, of each vector size_t bytes = n*n*sizeof (float); // Allocate memory for each vector on host h_a = new float[n*n]; h_b = new float[n*n]; h_c = new float[n*n]; // Initialize vectors on host for(int i = 0; i < n*n; i++ ) { h_a[i] = 1; h_b[i] = 2; } cl::STRING_CLASS buildlog; cl::Program program_; std::vector<cl::Device> devices; cl_int err = CL_SUCCESS; try { // Query platforms std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); if (platforms.size() == 0) { std::cout << "Platform size 0\n"; return -1; } // Get list of devices on default platform and create context cl_context_properties properties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; cl::Context context(CL_DEVICE_TYPE_CPU, properties); devices = context.getInfo<CL_CONTEXT_DEVICES>(); // Create command queue for first device cl::CommandQueue queue(context, devices[0], 0, &err); // Create device memory buffers d_a = cl::Buffer(context, CL_MEM_READ_ONLY, bytes); d_b = cl::Buffer(context, CL_MEM_READ_ONLY, bytes); d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, bytes); d_bwrk = cl::Local(n*sizeof(float)); // Bind memory buffers queue.enqueueWriteBuffer(d_a, CL_TRUE, 0, bytes, h_a); queue.enqueueWriteBuffer(d_b, CL_TRUE, 0, bytes, h_b); //Build kernel from source string cl::Program::Sources source(1, std::make_pair(kernelSource,strlen(kernelSource))); program_ = cl::Program(context, source); program_.build(devices); std::cout<<"BuildLog: \n"<<buildlog; 
// Create kernel object cl::Kernel kernel(program_, "multiMat", &err); // Bind kernel arguments to kernel kernel.setArg(0, d_a); kernel.setArg(1, d_b); kernel.setArg(2, d_c); kernel.setArg(3, n); kernel.setArg(4,d_bwrk); // Number of work items in each local work group cl::NDRange localSize(64); // Number of total work items - localSize must be devisor cl::NDRange globalSize((int)(ceil(n/(float)64)*64)); // Enqueue kernel cl::Event event; queue.enqueueNDRangeKernel( kernel, cl::NullRange, globalSize, localSize, NULL, &event); // Block until kernel completion event.wait(); // Read back d_c queue.enqueueReadBuffer(d_c, CL_TRUE, 0, bytes, h_c); } catch (cl::Error err) { std::cerr << "ERROR: "<<err.what()<<"("<<err.err()<<")"<<std::endl; buildlog = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0], NULL); std::ofstream logfile ("Matmullog.txt"); logfile<<buildlog; logfile.close(); } std::cout<<"Global Size side :"<< (int)(ceil(n/(float)64)*64)<<"\n"; // Sum up vector c and print result divided by n, this should equal 1 within error float sum = 0; for(int i=0; i<n*n; i++) sum += h_c[i]; std::cout<<"final result: "<<sum<<std::endl; //std::ofstream outfile ("MatmulAns.txt"); for(int i=0;i<n;i++) { for(int j=0;j<n;j++) { // outfile<<h_c[i*n+j]<<" "; } // outfile<<"\n"; } //outfile.close(); // Release host memory delete(h_a); delete(h_b); delete(h_c); return 0; }
// Builds the linear system A, b from all factors, eliminates the first m rows
// and columns with a Schur complement, and stores the reduced system in
// factored form in linearized_jacobians / linearized_residuals.
void MarginalizationInfo::marginalize()
{
    // Assign consecutive offsets to the blocks already present in
    // parameter_block_idx; m is their total local size.
    // NOTE(review): presumably these are the blocks being marginalized out -
    // confirm against the code that fills parameter_block_idx.
    int pos = 0;
    for (auto &it : parameter_block_idx)
    {
        it.second = pos;
        pos += localSize(parameter_block_size[it.first]);
    }

    m = pos;

    // Every other block known to parameter_block_size is placed after them;
    // n is the total local size of those remaining blocks.
    for (const auto &it : parameter_block_size)
    {
        if (parameter_block_idx.find(it.first) == parameter_block_idx.end())
        {
            parameter_block_idx[it.first] = pos;
            pos += localSize(it.second);
        }
    }

    n = pos - m;

    //ROS_DEBUG("marginalization, pos: %d, m: %d, n: %d, size: %d", pos, m, n, (int)parameter_block_idx.size());

    TicToc t_summing;
    Eigen::MatrixXd A(pos, pos);
    Eigen::VectorXd b(pos);
    A.setZero();
    b.setZero();
    // Disabled single-threaded accumulation of A and b, kept for reference.
    /*
    for (auto it : factors)
    {
        for (int i = 0; i < static_cast<int>(it->parameter_blocks.size()); i++)
        {
            int idx_i = parameter_block_idx[reinterpret_cast<long>(it->parameter_blocks[i])];
            int size_i = localSize(parameter_block_size[reinterpret_cast<long>(it->parameter_blocks[i])]);
            Eigen::MatrixXd jacobian_i = it->jacobians[i].leftCols(size_i);
            for (int j = i; j < static_cast<int>(it->parameter_blocks.size()); j++)
            {
                int idx_j = parameter_block_idx[reinterpret_cast<long>(it->parameter_blocks[j])];
                int size_j = localSize(parameter_block_size[reinterpret_cast<long>(it->parameter_blocks[j])]);
                Eigen::MatrixXd jacobian_j = it->jacobians[j].leftCols(size_j);
                if (i == j)
                    A.block(idx_i, idx_j, size_i, size_j) += jacobian_i.transpose() * jacobian_j;
                else
                {
                    A.block(idx_i, idx_j, size_i, size_j) += jacobian_i.transpose() * jacobian_j;
                    A.block(idx_j, idx_i, size_j, size_i) = A.block(idx_i, idx_j, size_i, size_j).transpose();
                }
            }
            b.segment(idx_i, size_i) += jacobian_i.transpose() * it->residuals;
        }
    }
    ROS_INFO("summing up costs %f ms", t_summing.toc());
    */
    // Multi-threaded accumulation: the factors are dealt out round-robin to
    // NUM_THREADS work items.

    TicToc t_thread_summing;
    pthread_t tids[NUM_THREADS];
    ThreadsStruct threadsstruct[NUM_THREADS];
    int i = 0;
    for (auto it : factors)
    {
        threadsstruct[i].sub_factors.push_back(it);
        i++;
        i = i % NUM_THREADS;
    }
    // Each work item gets its own zeroed A and b plus the block size/index
    // maps, and is then handed to ThreadsConstructA on a new thread.
    for (int i = 0; i < NUM_THREADS; i++)
    {
        TicToc zero_matrix;
        threadsstruct[i].A = Eigen::MatrixXd::Zero(pos,pos);
        threadsstruct[i].b = Eigen::VectorXd::Zero(pos);
        threadsstruct[i].parameter_block_size = parameter_block_size;
        threadsstruct[i].parameter_block_idx = parameter_block_idx;
        int ret = pthread_create( &tids[i], NULL, ThreadsConstructA ,(void*)&(threadsstruct[i]));
        if (ret != 0)
        {
            ROS_WARN("pthread_create error");
            ROS_BREAK();
        }
    }
    // Join the threads (in reverse creation order) and add up their partial
    // results.
    for( int i = NUM_THREADS - 1; i >= 0; i--)
    {
        pthread_join( tids[i], NULL );
        A += threadsstruct[i].A;
        b += threadsstruct[i].b;
    }
    //ROS_DEBUG("thread summing up costs %f ms", t_thread_summing.toc());
    //ROS_INFO("A diff %f , b diff %f ", (A - tmp_A).sum(), (b - tmp_b).sum());


    //TODO
    // Symmetrize the top-left m x m block before decomposing it.
    Eigen::MatrixXd Amm = 0.5 * (A.block(0, 0, m, m) + A.block(0, 0, m, m).transpose());
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> saes(Amm);

    //ROS_ASSERT_MSG(saes.eigenvalues().minCoeff() >= -1e-4, "min eigenvalue %f", saes.eigenvalues().minCoeff());

    // Inverse of Amm via its eigen-decomposition; eigenvalues that are not
    // greater than eps contribute 0 instead of their reciprocal.
    Eigen::MatrixXd Amm_inv = saes.eigenvectors() * Eigen::VectorXd((saes.eigenvalues().array() > eps).select(saes.eigenvalues().array().inverse(), 0)).asDiagonal() * saes.eigenvectors().transpose();
    //printf("error1: %f\n", (Amm * Amm_inv - Eigen::MatrixXd::Identity(m, m)).sum());

    // Split the system into the first m and the last n rows/columns.
    Eigen::VectorXd bmm = b.segment(0, m);
    Eigen::MatrixXd Amr = A.block(0, m, m, n);
    Eigen::MatrixXd Arm = A.block(m, 0, n, m);
    Eigen::MatrixXd Arr = A.block(m, m, n, n);
    Eigen::VectorXd brr = b.segment(m, n);
    // Schur complement: A and b now describe only the last n variables.
    A = Arr - Arm * Amm_inv * Amr;
    b = brr - Arm * Amm_inv * bmm;

    // Factor the reduced system: S holds the eigenvalues of A with those not
    // greater than eps replaced by 0, S_inv their reciprocals under the same
    // rule.
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> saes2(A);
    Eigen::VectorXd S = Eigen::VectorXd((saes2.eigenvalues().array() > eps).select(saes2.eigenvalues().array(), 0));
    Eigen::VectorXd S_inv = Eigen::VectorXd((saes2.eigenvalues().array() > eps).select(saes2.eigenvalues().array().inverse(), 0));

    Eigen::VectorXd S_sqrt = S.cwiseSqrt();
    Eigen::VectorXd S_inv_sqrt = S_inv.cwiseSqrt();

    // With V the eigenvectors of A: J = sqrt(S) * V^T and r = sqrt(S_inv) * V^T * b.
    // The disabled "error2" printout below compares J^T * J with A and
    // J^T * r with b.
    linearized_jacobians = S_sqrt.asDiagonal() * saes2.eigenvectors().transpose();
    linearized_residuals = S_inv_sqrt.asDiagonal() * saes2.eigenvectors().transpose() * b;
    //std::cout << A << std::endl
    //          << std::endl;
    //std::cout << linearized_jacobians << std::endl;
    //printf("error2: %f %f\n", (linearized_jacobians.transpose() * linearized_jacobians - A).sum(),
    //      (linearized_jacobians.transpose() * linearized_residuals - b).sum());
}
int main(int argc, char *argv[]) { gmactime_t s, t, S, T; cl_float* randArray = NULL; cl_float* output = NULL; cl_float* refOutput; cl_int numSamples = 64; getTime(&S); getTime(&s); assert(ecl::compileSource(code) == eclSuccess); setParam<cl_int>(&numSteps, numStepsStr, numStepsDefault); // Alloc & init data randArray = new (ecl::allocator) cl_float[numSamples * sizeof(cl_float4)]; output = new (ecl::allocator) cl_float[numSamples * sizeof(cl_float4)]; assert(randArray != NULL); assert(output != NULL); refOutput = (float*)malloc(numSamples * sizeof(cl_float4)); if(refOutput == NULL) return 0; getTime(&t); printTime(&s, &t, "Alloc: ", "\n"); getTime(&s); /* random initialisation of input */ for(int i = 0; i < numSamples * 4; i++) { randArray[i] = (float)rand() / (float)RAND_MAX; } valueInit(output, 0, numSamples * 4); getTime(&t); printTime(&s, &t, "Init: ", "\n"); getTime(&s); ecl::config globalSize(numSamples * (numSteps + 1)); ecl::config localSize(numSteps + 1); ecl::error err; ecl::kernel kernel("binomial_options", err); assert(err == eclSuccess); #ifndef __GXX_EXPERIMENTAL_CXX0X__ assert(kernel.setArg(0, numSteps) == eclSuccess); assert(kernel.setArg(1, randArray) == eclSuccess); assert(kernel.setArg(2, output) == eclSuccess); assert(kernel.setArg(3, (cl_float4 *)NULL) == eclSuccess); assert(kernel.setArg(4, (cl_float4 *)NULL) == eclSuccess); assert(kernel.callNDRange(globalSize, localSize) == eclSuccess); #else assert(kernel(globalSize, localSize)(numSteps, randArray, output, NULL, NULL) == eclSuccess); #endif getTime(&t); printTime(&s, &t, "Run: ", "\n"); printf("Output: "); for(int i = 0; i < numSamples; i++) { printf("%f ", output[i]); } getTime(&s); bool result = 1; binomialOptionCPUReference(refOutput, randArray, numSamples, numSteps); float error = 0.0f; float ref = 0.0f; for(int i = 1; i < numSamples; ++i) { float diff = output[i] - refOutput[i]; error += diff * diff; ref += output[i] * output[i]; } float normRef =::sqrtf((float) ref); if 
(::fabs((float) ref) < 1e-7f) { result = 0; } if(result) { float normError = ::sqrtf((float) error); error = normError / normRef; result = error < 0.001f; } if(result) printf("\nPassed!\n"); else printf("\nFailed!\n"); getTime(&t); printTime(&s, &t, "Check: ", "\n"); getTime(&T); printTime(&S, &T, "Total: ", "\n"); getTime(&s); free(refOutput); refOutput = NULL; ecl::free(randArray); ecl::free(output); getTime(&t); printTime(&s, &t, "Free: ", "\n"); return 0; }