void mean_first_launcher(Param out, Param owt, Param in, Param inWeight, const int threads_x, const uint groups_x, const uint groups_y) { bool input_weight = ((inWeight.info.dims[0] * inWeight.info.dims[1] * inWeight.info.dims[2] * inWeight.info.dims[3]) != 0); bool output_weight = (( owt.info.dims[0] * owt.info.dims[1] * owt.info.dims[2] * owt.info.dims[3]) != 0); std::string ref_name = std::string("mean_0_") + std::string(dtype_traits<Ti>::getName()) + std::string("_") + std::string(dtype_traits<Tw>::getName()) + std::string("_") + std::string(dtype_traits<To>::getName()) + std::string("_") + std::to_string(threads_x) + std::string("_") + std::to_string(input_weight) + std::string("_") + std::to_string(output_weight); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, ref_name); if (entry.prog==0 && entry.ker==0) { Binary<To, af_add_t> mean; ToNumStr<To> toNumStr; ToNumStr<Tw> twNumStr; Transform<uint, Tw, af_add_t> transform_weight; std::ostringstream options; options << " -D Ti=" << dtype_traits<Ti>::getName() << " -D Tw=" << dtype_traits<Tw>::getName() << " -D To=" << dtype_traits<To>::getName() << " -D DIMX=" << threads_x << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP << " -D init_To=" << toNumStr(mean.init()) << " -D init_Tw=" << twNumStr(transform_weight(0)) << " -D one_Tw=" << twNumStr(transform_weight(1)); if (input_weight) { options << " -D INPUT_WEIGHT"; } if (output_weight) { options << " -D OUTPUT_WEIGHT"; } if (std::is_same<Ti, double>::value || std::is_same<Ti, cdouble>::value || std::is_same<To, double>::value) { options << " -D USE_DOUBLE"; } const char *ker_strs[] = {mean_ops_cl, mean_first_cl}; const int ker_lens[] = {mean_ops_cl_len, mean_first_cl_len}; Program prog; buildProgram(prog, 2, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "mean_first_kernel"); addKernelToCache(device, ref_name, entry); } NDRange local(threads_x, THREADS_PER_GROUP / threads_x); NDRange 
global(groups_x * in.info.dims[2] * local[0], groups_y * in.info.dims[3] * local[1]); uint repeat = divup(in.info.dims[0], (local[0] * groups_x)); if (input_weight && output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *owt.data, owt.info, *in.data, in.info, *inWeight.data, inWeight.info, groups_x, groups_y, repeat); } else if (!input_weight && !output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info, groups_x, groups_y, repeat); } else if ( input_weight && !output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *in.data, in.info, *inWeight.data, inWeight.info, groups_x, groups_y, repeat); } else if (!input_weight && output_weight) { auto meanOp = KernelFunctor< Buffer, KParam, Buffer, KParam, Buffer, KParam, uint, uint, uint>(*entry.ker); meanOp(EnqueueArgs(getQueue(), global, local), *out.data, out.info, *owt.data, owt.info, *in.data, in.info, groups_x, groups_y, repeat); } CL_DEBUG_FINISH(getQueue()); }
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca, cl_mem dB, size_t dB_offset, int lddb, int incb, cl_command_queue queue) { std::string refName = std::string("swapdblk_") + std::string(dtype_traits<T>::getName()); int device = getActiveDeviceId(); kc_entry_t entry = kernelCache(device, refName); if (entry.prog == 0 && entry.ker == 0) { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName(); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) options << " -D USE_DOUBLE"; const char* ker_strs[] = {swapdblk_cl}; const int ker_lens[] = {swapdblk_cl_len}; Program prog; buildProgram(prog, 1, ker_strs, ker_lens, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "swapdblk"); addKernelToCache(device, refName, entry); } int nblocks = n / nb; if (nblocks == 0) return; int info = 0; if (n < 0) { info = -1; } else if (nb < 1 || nb > 1024) { info = -2; } else if (ldda < (nblocks - 1) * nb * inca + nb) { info = -4; } else if (inca < 0) { info = -5; } else if (lddb < (nblocks - 1) * nb * incb + nb) { info = -7; } else if (incb < 0) { info = -8; } if (info != 0) { AF_ERROR("Invalid configuration", AF_ERR_INTERNAL); return; } NDRange local(nb); NDRange global(nblocks * nb); cl::Buffer dAObj(dA, true); cl::Buffer dBObj(dB, true); auto swapdOp = KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer, unsigned long long, int, int>(*entry.ker); cl::CommandQueue q(queue); swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca, dBObj, dB_offset, lddb, incb); }
/* Initializes all ES objects needed to run the test */ void TextureCubeMapArrayColorDepthAttachmentsTest::initTest() { const glw::GLchar* depth_calculation_code = DE_NULL; const glw::Functions& gl = m_context.getRenderContext().getFunctions(); /* Check if EXT_texture_cube_map_array extension is supported */ if (true != m_is_texture_cube_map_array_supported) { throw tcu::NotSupportedError(TEXTURE_CUBE_MAP_ARRAY_EXTENSION_NOT_SUPPORTED); } /* This test should only run if EXT_geometry_shader is supported */ if (true != m_is_geometry_shader_extension_supported) { throw tcu::NotSupportedError(GEOMETRY_SHADER_EXTENSION_NOT_SUPPORTED); } /* Generate and bind VAO */ gl.genVertexArrays(1, &m_vao_id); GLU_EXPECT_NO_ERROR(gl.getError(), "Could not generate vertex array object"); gl.bindVertexArray(m_vao_id); GLU_EXPECT_NO_ERROR(gl.getError(), "Error binding vertex array object!"); /* Create a framebuffer object */ gl.genFramebuffers(1, &m_framebuffer_object_id); GLU_EXPECT_NO_ERROR(gl.getError(), "genFramebuffers"); /* Determine which depth format can be used as a depth attachment without * making the FBO incomplete */ determineSupportedDepthFormat(); /* Decide which code snippet to use for depth value calculation */ switch (m_depth_internal_format) { case GL_DEPTH_COMPONENT16: { depth_calculation_code = "-1.0 + float(2 * layer) / float(0xffff)"; break; } case GL_DEPTH_COMPONENT24: { depth_calculation_code = "-1.0 + float(2 * layer) / float(0xffffff)"; break; } case GL_DEPTH_COMPONENT32F: { depth_calculation_code = "-1.0 + float(2 * layer) / 256.0"; break; } default: { TCU_FAIL("Unrecognized depth internal format"); } } /* switch (m_depth_internal_format) */ /* Create shader objects */ m_fragment_shader_id = gl.createShader(GL_FRAGMENT_SHADER); m_layered_geometry_shader_id = gl.createShader(m_glExtTokens.GEOMETRY_SHADER); m_non_layered_geometry_shader_id = gl.createShader(m_glExtTokens.GEOMETRY_SHADER); m_vertex_shader_id = gl.createShader(GL_VERTEX_SHADER); 
GLU_EXPECT_NO_ERROR(gl.getError(), "glCreateShader() call(s) failed."); /* Create program objects */ m_layered_program_id = gl.createProgram(); m_non_layered_program_id = gl.createProgram(); GLU_EXPECT_NO_ERROR(gl.getError(), "glCreateProgram() call(s) failed"); /* Build up an array of snippets making up bodies of two geometry shaders * we'll be using for the test. */ const glw::GLchar* const layered_geometry_shader_parts[] = { m_geometry_shader_code_preamble, m_geometry_shader_code_layered, " float depth = ", depth_calculation_code, m_geometry_shader_code_body }; const glw::GLchar* const non_layered_geometry_shader_parts[] = { m_geometry_shader_code_preamble, m_geometry_shader_code_non_layered, " float depth = ", depth_calculation_code, m_geometry_shader_code_body }; const glw::GLuint n_layered_geometry_shader_parts = sizeof(layered_geometry_shader_parts) / sizeof(layered_geometry_shader_parts[0]); const glw::GLuint n_non_layered_geometry_shader_parts = sizeof(non_layered_geometry_shader_parts) / sizeof(non_layered_geometry_shader_parts[0]); /* Build both programs */ if (!buildProgram(m_layered_program_id, m_fragment_shader_id, 1, &m_fragment_shader_code, m_layered_geometry_shader_id, n_layered_geometry_shader_parts, layered_geometry_shader_parts, m_vertex_shader_id, 1, &m_vertex_shader_code)) { TCU_FAIL("Could not build layered-case program object"); } if (!buildProgram(m_non_layered_program_id, m_fragment_shader_id, 1, &m_fragment_shader_code, m_non_layered_geometry_shader_id, n_non_layered_geometry_shader_parts, non_layered_geometry_shader_parts, m_vertex_shader_id, 1, &m_vertex_shader_code)) { TCU_FAIL("Could not build non-layered-case program object"); } /* Get location of "uni_layer" uniform */ m_non_layered_program_id_uni_layer_uniform_location = gl.getUniformLocation(m_non_layered_program_id, "uni_layer"); if ((-1 == m_non_layered_program_id_uni_layer_uniform_location) || (GL_NO_ERROR != gl.getError())) { TCU_FAIL("Could not retrieve location of uni_layer 
uniform for non-layered program"); } }
void init() { //std::string racineProjet = "C:/Users/etu/workspace/code/Rendu temps reel/"; std::string racineProjet = "C:/Users/etu/Documents/GitHub/Gamagora-Rendu_temps_reel-TP/"; //std::string racineProjet = "B:/Utilisateur/git/code/Gamagora-Rendu_temps_reel-TP/"; // Build our program and an empty VAO gs.programView = buildProgram((racineProjet+(std::string)"basic.vsl").c_str(), (racineProjet+(std::string)"basic.fsl").c_str()); Mesh m; m = ObjManager::loadFromOBJ(Vector3D(0,0,0), (racineProjet+(std::string)"monkey.obj").c_str()); nbVertex = m.nbface()+6; //nbface + quad "sol" float* data = (float*) malloc(nbVertex*4*sizeof(float)); float* dataNormal = (float*) malloc(nbVertex * 4 * sizeof(float)); std::vector<Vector3D> vertex = m.getvertex(); std::vector<int> face = m.getface(); std::vector<Vector3D> normals = m.getNormals(); std::vector<int> normalIds = m.getNormalIds(); int i=0; for(int j=0; j<face.size(); j++){ //set vertex data[i] = vertex[face[j]].x; data[i+1] = vertex[face[j]].y; data[i+2] = vertex[face[j]].z; data[i+3] = 1; dataNormal[i] = normals[normalIds[j]].x; dataNormal[i+1] = normals[normalIds[j]].y; dataNormal[i+2] = normals[normalIds[j]].z; dataNormal[i+3] = 1; i+=4; } //ajout du quad pour faire le sol ajoutSol(Vector3D(-15,-1,-15), Vector3D(15,-1,-15), Vector3D(15,-1,15), Vector3D(-15,-1,15), Vector3D(0,1,0), nbVertex*4, data, dataNormal); GLuint buffer; glGenBuffers(1, &buffer); glBindBuffer(GL_ARRAY_BUFFER, buffer); glBufferData(GL_ARRAY_BUFFER, nbVertex*4*4, data, GL_STATIC_DRAW); GLuint buffer2; glGenBuffers(1, &buffer2); glBindBuffer(GL_ARRAY_BUFFER, buffer2); glBufferData(GL_ARRAY_BUFFER, nbVertex*4*4, dataNormal, GL_STATIC_READ); glCreateVertexArrays(1, &gs.vao); glBindVertexArray(gs.vao); glBindBuffer(GL_ARRAY_BUFFER, buffer); glVertexAttribPointer(12, 4, GL_FLOAT, GL_FALSE, 0, 0); glEnableVertexAttribArray(12); glBindBuffer(GL_ARRAY_BUFFER, buffer2); glVertexAttribPointer(13, 4, GL_FLOAT, GL_FALSE, 0, 0); glEnableVertexAttribArray(13); 
glBindVertexArray(0); glEnable(GL_DEPTH_TEST); // create the depth texture glGenTextures(1, &gs.depthTexture); glBindTexture(GL_TEXTURE_2D, gs.depthTexture); glTexStorage2D(GL_TEXTURE_2D, 1, GL_DEPTH_COMPONENT32F, 640, 480); // Framebuffer glGenFramebuffers(1, &gs.fbo); glBindFramebuffer(GL_FRAMEBUFFER, gs.fbo); glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, gs.depthTexture, 0); assert(glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE); glBindFramebuffer(GL_FRAMEBUFFER, 0); glActiveTexture(GL_TEXTURE0); glBindTexture(GL_TEXTURE_2D, gs.depthTexture); glBindVertexArray(0); free(data); free(dataNormal); }
void fast(unsigned* out_feat, Param &x_out, Param &y_out, Param &score_out, Param in, const float thr, const float feature_ratio, const unsigned edge) { try { static std::once_flag compileFlags[DeviceManager::MAX_DEVICES]; static std::map<int, Program*> fastProgs; static std::map<int, Kernel*> lfKernel; static std::map<int, Kernel*> nmKernel; static std::map<int, Kernel*> gfKernel; int device = getActiveDeviceId(); std::call_once( compileFlags[device], [device] () { std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D ARC_LENGTH=" << arc_length << " -D NONMAX=" << static_cast<unsigned>(nonmax); if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } cl::Program prog; buildProgram(prog, fast_cl, fast_cl_len, options.str()); fastProgs[device] = new Program(prog); lfKernel[device] = new Kernel(*fastProgs[device], "locate_features"); nmKernel[device] = new Kernel(*fastProgs[device], "non_max_counts"); gfKernel[device] = new Kernel(*fastProgs[device], "get_features"); }); const unsigned max_feat = ceil(in.info.dims[0] * in.info.dims[1] * feature_ratio); // Matrix containing scores for detected features, scores are stored in the // same coordinates as features, dimensions should be equal to in. 
cl::Buffer *d_score = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float)); std::vector<float> score_init(in.info.dims[0] * in.info.dims[1], (float)0); getQueue().enqueueWriteBuffer(*d_score, CL_TRUE, 0, in.info.dims[0] * in.info.dims[1] * sizeof(float), &score_init[0]); cl::Buffer *d_flags = d_score; if (nonmax) { d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T)); } const int blk_x = divup(in.info.dims[0]-edge*2, FAST_THREADS_X); const int blk_y = divup(in.info.dims[1]-edge*2, FAST_THREADS_Y); // Locate features kernel sizes const NDRange local(FAST_THREADS_X, FAST_THREADS_Y); const NDRange global(blk_x * FAST_THREADS_X, blk_y * FAST_THREADS_Y); auto lfOp = make_kernel<Buffer, KParam, Buffer, const float, const unsigned, LocalSpaceArg> (*lfKernel[device]); lfOp(EnqueueArgs(getQueue(), global, local), *in.data, in.info, *d_score, thr, edge, cl::Local((FAST_THREADS_X + 6) * (FAST_THREADS_Y + 6) * sizeof(T))); CL_DEBUG_FINISH(getQueue()); const int blk_nonmax_x = divup(in.info.dims[0], 64); const int blk_nonmax_y = divup(in.info.dims[1], 64); // Nonmax kernel sizes const NDRange local_nonmax(FAST_THREADS_NONMAX_X, FAST_THREADS_NONMAX_Y); const NDRange global_nonmax(blk_nonmax_x * FAST_THREADS_NONMAX_X, blk_nonmax_y * FAST_THREADS_NONMAX_Y); unsigned count_init = 0; cl::Buffer *d_total = bufferAlloc(sizeof(unsigned)); getQueue().enqueueWriteBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &count_init); //size_t *global_nonmax_dims = global_nonmax(); size_t blocks_sz = blk_nonmax_x * FAST_THREADS_NONMAX_X * blk_nonmax_y * FAST_THREADS_NONMAX_Y * sizeof(unsigned); cl::Buffer *d_counts = bufferAlloc(blocks_sz); cl::Buffer *d_offsets = bufferAlloc(blocks_sz); auto nmOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const unsigned> (*nmKernel[device]); nmOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax), *d_counts, *d_offsets, *d_total, *d_flags, *d_score, in.info, edge); CL_DEBUG_FINISH(getQueue()); unsigned total; 
getQueue().enqueueReadBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &total); total = total < max_feat ? total : max_feat; if (total > 0) { size_t out_sz = total * sizeof(float); x_out.data = bufferAlloc(out_sz); y_out.data = bufferAlloc(out_sz); score_out.data = bufferAlloc(out_sz); auto gfOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer, Buffer, KParam, const unsigned, const unsigned> (*gfKernel[device]); gfOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax), *x_out.data, *y_out.data, *score_out.data, *d_flags, *d_counts, *d_offsets, in.info, total, edge); CL_DEBUG_FINISH(getQueue()); } *out_feat = total; x_out.info.dims[0] = total; x_out.info.strides[0] = 1; y_out.info.dims[0] = total; y_out.info.strides[0] = 1; score_out.info.dims[0] = total; score_out.info.strides[0] = 1; for (int k = 1; k < 4; k++) { x_out.info.dims[k] = 1; x_out.info.strides[k] = total; y_out.info.dims[k] = 1; y_out.info.strides[k] = total; score_out.info.dims[k] = 1; score_out.info.strides[k] = total; } bufferFree(d_score); if (nonmax) bufferFree(d_flags); bufferFree(d_total); bufferFree(d_counts); bufferFree(d_offsets); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
/**
 * Enqueues the "approx1_kernel" OpenCL kernel, compiling it once per device.
 *
 * Ty, Tp and method are declared outside this excerpt. INTERP is defined at
 * compile time only for AF_INTERP_NEAREST and AF_INTERP_LINEAR; any other
 * method leaves it undefined.
 *
 * @param out      output buffer + info; its dims size the launch
 * @param in       input buffer + info
 * @param pos      positions buffer + info
 * @param offGrid  forwarded to the kernel unchanged
 */
void approx1(Param out, const Param in, const Param pos, const float offGrid)
{
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> approxProgs;
        static std::map<int, Kernel*> approxKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device]() {
            ToNum<Ty> toNum;
            std::ostringstream options;
            options << " -D Ty=" << dtype_traits<Ty>::getName()
                    << " -D Tp=" << dtype_traits<Tp>::getName()
                    << " -D ZERO=" << toNum(scalar<Ty>(0));

            if ((af_dtype)dtype_traits<Ty>::af_type == c32 ||
                (af_dtype)dtype_traits<Ty>::af_type == c64) {
                options << " -D CPLX=1";
            } else {
                options << " -D CPLX=0";
            }

            if (std::is_same<Ty, double>::value ||
                std::is_same<Ty, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            switch (method) {
                case AF_INTERP_NEAREST:
                    options << " -D INTERP=NEAREST";
                    break;
                case AF_INTERP_LINEAR:
                    options << " -D INTERP=LINEAR";
                    break;
                default:
                    break;
            }

            Program prog;
            buildProgram(prog, approx1_cl, approx1_cl_len, options.str());
            approxProgs[device]   = new Program(prog);
            approxKernels[device] = new Kernel(*approxProgs[device], "approx1_kernel");
        });

        auto approx1Op =
            make_kernel<Buffer, const KParam, const Buffer, const KParam,
                        const Buffer, const KParam, const float, const int>(
                *approxKernels[device]);

        NDRange local(THREADS, 1, 1);
        int blocksPerMat = divup(out.info.dims[0], local[0]);

        // X covers dims[0] (in blocks of local[0]) for each of the dims[1]
        // columns. Y carries one work-group per (dims[2], dims[3]) pair, so
        // it is scaled by the Y work-group size, local[1]. Scaling it by
        // local[0] instead would launch THREADS times more work-groups along
        // Y than there are pairs.
        NDRange global(blocksPerMat * local[0] * out.info.dims[1],
                       out.info.dims[2] * out.info.dims[3] * local[1],
                       1);

        approx1Op(EnqueueArgs(getQueue(), global, local),
                  *out.data, out.info, *in.data, in.info,
                  *pos.data, pos.info, offGrid, blocksPerMat);

        CL_DEBUG_FINISH(getQueue());
    } catch (const cl::Error &err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
/**
 * Enqueues the "csrmm_nt" OpenCL kernel, building and caching it on first
 * use for the active device.
 *
 * T is declared outside this excerpt. The kernel is specialised on T and on
 * whether alpha differs from 1 and beta differs from 0.
 *
 * @param out     output buffer
 * @param values  values buffer
 * @param rowIdx  row index buffer; M is taken as rowIdx.info.dims[0] - 1
 * @param colIdx  column index buffer
 * @param rhs     right-hand side; N is taken as rhs.info.dims[0]
 * @param alpha   forwarded to the kernel
 * @param beta    forwarded to the kernel
 */
void csrmm_nt(Param out, const Param &values, const Param &rowIdx,
              const Param &colIdx, const Param &rhs, const T alpha,
              const T beta) {
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    std::string cache_key("csrmm_nt_");
    cache_key += dtype_traits<T>::getName();
    cache_key += "_";
    cache_key += std::to_string(use_alpha);
    cache_key += "_";
    cache_key += std::to_string(use_beta);
    cache_key += "_";
    cache_key += std::to_string(use_greedy);

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, cache_key);

    // Cache miss: both handles are null.
    if (!entry.prog && !entry.ker) {
        const bool is_double_type =
            std::is_same<T, double>::value || std::is_same<T, cdouble>::value;
        const bool is_complex_type =
            std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value;

        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName()
                << " -D USE_ALPHA=" << use_alpha
                << " -D USE_BETA=" << use_beta
                << " -D USE_GREEDY=" << use_greedy
                << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (is_double_type) { options << " -D USE_DOUBLE"; }
        options << (is_complex_type ? " -D IS_CPLX=1" : " -D IS_CPLX=0");

        const char *ker_strs[] = {csrmm_cl};
        const int ker_lens[]   = {csrmm_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());

        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, cache_key, entry);
    }

    auto kernel = entry.ker[0];
    auto launch =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, int, Buffer, KParam,
                      T, T, Buffer>(kernel);

    NDRange local(THREADS_PER_GROUP, 1);

    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    // Cap the number of groups along y.
    if (groups_y > MAX_CSRMM_GROUPS) { groups_y = MAX_CSRMM_GROUPS; }

    NDRange global(local[0] * groups_x, local[1] * groups_y);

    // One zero-initialised int per x-group, uploaded as the counter buffer.
    std::vector<int> zeros(groups_x);
    const size_t counter_bytes = zeros.size() * sizeof(int);

    cl::Buffer *counter = bufferAlloc(counter_bytes);
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0, counter_bytes,
                                  (void *)zeros.data());

    launch(EnqueueArgs(getQueue(), global, local), *out.data, *values.data,
           *rowIdx.data, *colIdx.data, M, N, *rhs.data, rhs.info, alpha, beta,
           *counter);

    bufferFree(counter);
}
void conv2Helper(const conv_kparam_t& param, Param out, const Param signal, const Param filter) { try { int f0 = filter.info.dims[0]; int f1 = filter.info.dims[1]; std::string ref_name = std::string("conv2_") + std::string(dtype_traits<T>::getName()) + std::string("_") + std::string(dtype_traits<aT>::getName()) + std::string("_") + std::to_string(expand) + std::string("_") + std::to_string(f0) + std::string("_") + std::to_string(f1); int device = getActiveDeviceId(); kc_t::iterator idx = kernelCaches[device].find(ref_name); kc_entry_t entry; if (idx == kernelCaches[device].end()) { size_t LOC_SIZE = (THREADS_X+2*(f0-1))*(THREADS_Y+2*(f1-1)); std::ostringstream options; options << " -D T=" << dtype_traits<T>::getName() << " -D accType="<< dtype_traits<aT>::getName() << " -D BASE_DIM="<< 2 /* hard constant specific to this convolution type */ << " -D FLEN0=" << f0 << " -D FLEN1=" << f1 << " -D EXPAND="<< expand << " -D C_SIZE="<< LOC_SIZE; if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) { options << " -D USE_DOUBLE"; } Program prog; buildProgram(prog, convolve_cl, convolve_cl_len, options.str()); entry.prog = new Program(prog); entry.ker = new Kernel(*entry.prog, "convolve"); kernelCaches[device][ref_name] = entry; } else { entry = idx->second; } auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam, Buffer, KParam, int, int, int, int, int, int >(*entry.ker); convOp(EnqueueArgs(getQueue(), param.global, param.local), *out.data, out.info, *signal.data, signal.info, *param.impulse, filter.info, param.nBBS0, param.nBBS1, param.o[1], param.o[2], param.s[1], param.s[2]); } catch (cl::Error err) { CL_TO_AF_ERROR(err); throw; } }
// setup int main(int argc, char *argv[]) { GLenum err = 0; /********************************************* * GLFW SETUP *********************************************/ err = glfwInit(); if (!err) { fputs("Failed to load the GLFW library", stderr); exit(EXIT_FAILURE); } /********************************************* * STATE SETUP (initialize gl context) *********************************************/ // must be setup before glew so that a valid openGL // context exists (created with the window) w_state = new WorldState(); c_state.init(*w_state); /********************************************* * GLEW SETUP *********************************************/ err = glewInit(); if (err != GLEW_OK) { fputs("Failed to initialize the GLEW library", stderr); exit(EXIT_FAILURE); } /********************************************* * STATE SETUP (construct render states) *********************************************/ // must be setup after glew so that GL array // objects exist r_state[0] = new RenderState(3); r_state[1] = new RenderState(3); /********************************************* * SHADER SETUP *********************************************/ // read default shaders from file GLuint shaderProgram[2] = {0}; GLuint shaders[2] = {0}; buildShader(GL_VERTEX_SHADER, "default.vs.glsl", shaders[0]); buildShader(GL_FRAGMENT_SHADER, "default.fs.glsl", shaders[1]); // create default shader program shaderProgram[0] = buildProgram(2, shaders); // bind shader program w_state->setProgram(0, shaderProgram[0]); w_state->useProgram(0); // setup the transform matrices and uniform variables w_state->loadTransforms(); w_state->loadLights(); w_state->loadMaterials(); /********************************************* * LOAD MESH *********************************************/ g_mesh = loadMeshFromFile(*r_state[0], "Mesh/arma.obj"); /********************************************* * SET GL STATE *********************************************/ glEnable(GL_DEPTH_TEST); 
/********************************************* * RENDER LOOP *********************************************/ glfwSetTime(0.0); while (!glfwWindowShouldClose(c_state.window)) display(); /********************************************* * CLEAN UP *********************************************/ delete g_mesh; glfwTerminate(); exit(EXIT_SUCCESS); }