Ejemplo n.º 1
0
/**
 * Builds (once per configuration and device) and enqueues the OpenCL kernel
 * "mean_first_kernel".
 *
 * The kernel is specialised on the Ti/Tw/To type names, on threads_x, and on
 * whether the input-weight and output-weight arrays are present; the compiled
 * program is stored in the kernel cache under a name derived from those values.
 *
 * @param out        output array
 * @param owt        output weights; treated as absent when any dimension is 0
 * @param in         input array
 * @param inWeight   input weights; treated as absent when any dimension is 0
 * @param threads_x  work-group size along x (also passed to the kernel as DIMX)
 * @param groups_x   number of work-groups along x per dims[2] slice
 * @param groups_y   number of work-groups along y per dims[3] slice
 */
void mean_first_launcher(Param out, Param owt,
        Param in, Param inWeight,
        const int threads_x,
        const uint groups_x,
        const uint groups_y)
{
    // An array is "present" only when the product of its four dimensions is non-zero.
    const auto hasElements = [](const Param &p) {
        return (p.info.dims[0] *
                p.info.dims[1] *
                p.info.dims[2] *
                p.info.dims[3]) != 0;
    };

    const bool input_weight  = hasElements(inWeight);
    const bool output_weight = hasElements(owt);

    // Cache key: one entry per type triple, work-group width and weight combination.
    std::string ref_name("mean_0_");
    ref_name += dtype_traits<Ti>::getName();
    ref_name += "_";
    ref_name += dtype_traits<Tw>::getName();
    ref_name += "_";
    ref_name += dtype_traits<To>::getName();
    ref_name += "_";
    ref_name += std::to_string(threads_x);
    ref_name += "_";
    ref_name += std::to_string(input_weight);
    ref_name += "_";
    ref_name += std::to_string(output_weight);

    const int device = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    const bool needsBuild = (entry.prog == 0) && (entry.ker == 0);
    if (needsBuild) {
        Binary<To, af_add_t> mean;
        ToNumStr<To> toNumStr;
        ToNumStr<Tw> twNumStr;
        Transform<uint, Tw, af_add_t> transform_weight;

        std::ostringstream options;
        options << " -D Ti=" << dtype_traits<Ti>::getName();
        options << " -D Tw=" << dtype_traits<Tw>::getName();
        options << " -D To=" << dtype_traits<To>::getName();
        options << " -D DIMX=" << threads_x;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;
        options << " -D init_To=" << toNumStr(mean.init());
        options << " -D init_Tw=" << twNumStr(transform_weight(0));
        options << " -D one_Tw=" << twNumStr(transform_weight(1));

        // The kernel signature changes with these defines; see the dispatch below.
        if (input_weight) options << " -D INPUT_WEIGHT";
        if (output_weight) options << " -D OUTPUT_WEIGHT";

        const bool usesDouble = std::is_same<Ti, double>::value ||
                                std::is_same<Ti, cdouble>::value ||
                                std::is_same<To, double>::value;
        if (usesDouble) options << " -D USE_DOUBLE";

        const char *ker_strs[] = {mean_ops_cl, mean_first_cl};
        const int   ker_lens[] = {mean_ops_cl_len, mean_first_cl_len};
        Program prog;
        buildProgram(prog, 2, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker = new Kernel(*entry.prog, "mean_first_kernel");

        addKernelToCache(device, ref_name, entry);
    }

    NDRange local(threads_x, THREADS_PER_GROUP / threads_x);
    NDRange global(groups_x * in.info.dims[2] * local[0],
            groups_y * in.info.dims[3] * local[1]);

    uint repeat = divup(in.info.dims[0], (local[0] * groups_x));

    // The argument list must match the variant that was compiled above.
    EnqueueArgs launchArgs(getQueue(), global, local);

    if (input_weight) {
        if (output_weight) {
            // out, out-weights, in, in-weights
            auto meanOp = KernelFunctor<
                Buffer, KParam,
                Buffer, KParam,
                Buffer, KParam,
                Buffer, KParam,
                uint, uint, uint>(*entry.ker);
            meanOp(launchArgs,
                    *out.data, out.info,
                    *owt.data, owt.info,
                    *in.data, in.info,
                    *inWeight.data, inWeight.info,
                    groups_x, groups_y, repeat);
        } else {
            // out, in, in-weights
            auto meanOp = KernelFunctor<
                Buffer, KParam,
                Buffer, KParam,
                Buffer, KParam,
                uint, uint, uint>(*entry.ker);
            meanOp(launchArgs,
                    *out.data, out.info,
                    *in.data, in.info,
                    *inWeight.data, inWeight.info,
                    groups_x, groups_y, repeat);
        }
    } else {
        if (output_weight) {
            // out, out-weights, in
            auto meanOp = KernelFunctor<
                Buffer, KParam,
                Buffer, KParam,
                Buffer, KParam,
                uint, uint, uint>(*entry.ker);
            meanOp(launchArgs,
                    *out.data, out.info,
                    *owt.data, owt.info,
                    *in.data, in.info,
                    groups_x, groups_y, repeat);
        } else {
            // out, in
            auto meanOp = KernelFunctor<
                Buffer, KParam,
                Buffer, KParam,
                uint, uint, uint>(*entry.ker);
            meanOp(launchArgs,
                    *out.data, out.info,
                    *in.data, in.info,
                    groups_x, groups_y, repeat);
        }
    }

    CL_DEBUG_FINISH(getQueue());
}
Ejemplo n.º 2
0
/**
 * Compiles (once per element type and device) and enqueues the "swapdblk"
 * OpenCL kernel on the two device buffers dA and dB.
 *
 * The launch uses n / nb work-groups of nb work-items each; when n / nb is
 * zero nothing is enqueued.
 *
 * @param n          total extent; must be >= 0
 * @param nb         block size; must be in [1, 1024]
 * @param dA         first device buffer
 * @param dA_offset  offset into dA, forwarded to the kernel
 * @param ldda       leading dimension of dA; must be >= (nblocks-1)*nb*inca + nb
 * @param inca       stride for dA; must be >= 0
 * @param dB         second device buffer
 * @param dB_offset  offset into dB, forwarded to the kernel
 * @param lddb       leading dimension of dB; must be >= (nblocks-1)*nb*incb + nb
 * @param incb       stride for dB; must be >= 0
 * @param queue      command queue the kernel is enqueued on
 *
 * Raises AF_ERR_INTERNAL through AF_ERROR when an argument is out of range.
 */
void swapdblk(int n, int nb, cl_mem dA, size_t dA_offset, int ldda, int inca,
              cl_mem dB, size_t dB_offset, int lddb, int incb,
              cl_command_queue queue) {
    std::string refName =
        std::string("swapdblk_") + std::string(dtype_traits<T>::getName());

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;

        options << " -D T=" << dtype_traits<T>::getName();
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {swapdblk_cl};
        const int ker_lens[]   = {swapdblk_cl_len};
        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "swapdblk");

        addKernelToCache(device, refName, entry);
    }

    // n and nb must be validated BEFORE computing n / nb: nb == 0 would be a
    // division by zero, and the "nothing to do" early return below would
    // otherwise hide a negative n.
    if (n < 0 || nb < 1 || nb > 1024) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    const int nblocks = n / nb;

    if (nblocks == 0) return;

    // The stride checks depend on nblocks, so they run after the division.
    const bool badA = (ldda < (nblocks - 1) * nb * inca + nb) || (inca < 0);
    const bool badB = (lddb < (nblocks - 1) * nb * incb + nb) || (incb < 0);
    if (badA || badB) {
        AF_ERROR("Invalid configuration", AF_ERR_INTERNAL);
        return;
    }

    // One work-group per block, nb work-items per group.
    NDRange local(nb);
    NDRange global(nblocks * nb);

    // 'true' retains the raw handles so the wrappers' destructors do not
    // release memory objects owned by the caller.
    cl::Buffer dAObj(dA, true);
    cl::Buffer dBObj(dB, true);

    auto swapdOp =
        KernelFunctor<int, Buffer, unsigned long long, int, int, Buffer,
                      unsigned long long, int, int>(*entry.ker);

    // NOTE(review): unlike the buffers above, the queue handle is wrapped
    // without a retain flag — confirm the wrapper's destructor does not drop a
    // reference the caller still needs.
    cl::CommandQueue q(queue);
    swapdOp(EnqueueArgs(q, global, local), nb, dAObj, dA_offset, ldda, inca,
            dBObj, dB_offset, lddb, incb);
}
/* Initializes all ES objects needed to run the test */
void TextureCubeMapArrayColorDepthAttachmentsTest::initTest()
{
	const glw::GLchar*	depth_calculation_code = DE_NULL;
	const glw::Functions& gl					 = m_context.getRenderContext().getFunctions();

	/* Check if EXT_texture_cube_map_array extension is supported */
	if (true != m_is_texture_cube_map_array_supported)
	{
		throw tcu::NotSupportedError(TEXTURE_CUBE_MAP_ARRAY_EXTENSION_NOT_SUPPORTED);
	}

	/* This test should only run if EXT_geometry_shader is supported */
	if (true != m_is_geometry_shader_extension_supported)
	{
		throw tcu::NotSupportedError(GEOMETRY_SHADER_EXTENSION_NOT_SUPPORTED);
	}

	/* Generate and bind VAO */
	gl.genVertexArrays(1, &m_vao_id);
	GLU_EXPECT_NO_ERROR(gl.getError(), "Could not generate vertex array object");

	gl.bindVertexArray(m_vao_id);
	GLU_EXPECT_NO_ERROR(gl.getError(), "Error binding vertex array object!");

	/* Create a framebuffer object */
	gl.genFramebuffers(1, &m_framebuffer_object_id);
	GLU_EXPECT_NO_ERROR(gl.getError(), "genFramebuffers");

	/* Determine which depth format can be used as a depth attachment without
	 * making the FBO incomplete */
	determineSupportedDepthFormat();

	/* Decide which code snippet to use for depth value calculation */
	switch (m_depth_internal_format)
	{
	case GL_DEPTH_COMPONENT16:
	{
		depth_calculation_code = "-1.0 + float(2 * layer) / float(0xffff)";

		break;
	}

	case GL_DEPTH_COMPONENT24:
	{
		depth_calculation_code = "-1.0 + float(2 * layer) / float(0xffffff)";

		break;
	}

	case GL_DEPTH_COMPONENT32F:
	{
		depth_calculation_code = "-1.0 + float(2 * layer) / 256.0";

		break;
	}

	default:
	{
		TCU_FAIL("Unrecognized depth internal format");
	}
	} /* switch (m_depth_internal_format) */

	/* Create shader objects */
	m_fragment_shader_id			 = gl.createShader(GL_FRAGMENT_SHADER);
	m_layered_geometry_shader_id	 = gl.createShader(m_glExtTokens.GEOMETRY_SHADER);
	m_non_layered_geometry_shader_id = gl.createShader(m_glExtTokens.GEOMETRY_SHADER);
	m_vertex_shader_id				 = gl.createShader(GL_VERTEX_SHADER);

	GLU_EXPECT_NO_ERROR(gl.getError(), "glCreateShader() call(s) failed.");

	/* Create program objects */
	m_layered_program_id	 = gl.createProgram();
	m_non_layered_program_id = gl.createProgram();

	GLU_EXPECT_NO_ERROR(gl.getError(), "glCreateProgram() call(s) failed");

	/* Build up an array of snippets making up bodies of two geometry shaders
	 * we'll be using for the test.
	 */
	const glw::GLchar* const layered_geometry_shader_parts[] = { m_geometry_shader_code_preamble,
																 m_geometry_shader_code_layered,
																 "    float depth = ", depth_calculation_code,
																 m_geometry_shader_code_body };

	const glw::GLchar* const non_layered_geometry_shader_parts[] = { m_geometry_shader_code_preamble,
																	 m_geometry_shader_code_non_layered,
																	 "    float depth = ", depth_calculation_code,
																	 m_geometry_shader_code_body };

	const glw::GLuint n_layered_geometry_shader_parts =
		sizeof(layered_geometry_shader_parts) / sizeof(layered_geometry_shader_parts[0]);
	const glw::GLuint n_non_layered_geometry_shader_parts =
		sizeof(non_layered_geometry_shader_parts) / sizeof(non_layered_geometry_shader_parts[0]);

	/* Build both programs */
	if (!buildProgram(m_layered_program_id, m_fragment_shader_id, 1, &m_fragment_shader_code,
					  m_layered_geometry_shader_id, n_layered_geometry_shader_parts, layered_geometry_shader_parts,
					  m_vertex_shader_id, 1, &m_vertex_shader_code))
	{
		TCU_FAIL("Could not build layered-case program object");
	}

	if (!buildProgram(m_non_layered_program_id, m_fragment_shader_id, 1, &m_fragment_shader_code,
					  m_non_layered_geometry_shader_id, n_non_layered_geometry_shader_parts,
					  non_layered_geometry_shader_parts, m_vertex_shader_id, 1, &m_vertex_shader_code))
	{
		TCU_FAIL("Could not build non-layered-case program object");
	}

	/* Get location of "uni_layer" uniform */
	m_non_layered_program_id_uni_layer_uniform_location = gl.getUniformLocation(m_non_layered_program_id, "uni_layer");

	if ((-1 == m_non_layered_program_id_uni_layer_uniform_location) || (GL_NO_ERROR != gl.getError()))
	{
		TCU_FAIL("Could not retrieve location of uni_layer uniform for non-layered program");
	}
}
void init()
{
    //std::string racineProjet = "C:/Users/etu/workspace/code/Rendu temps reel/";
    std::string racineProjet = "C:/Users/etu/Documents/GitHub/Gamagora-Rendu_temps_reel-TP/";
    //std::string racineProjet = "B:/Utilisateur/git/code/Gamagora-Rendu_temps_reel-TP/";

	// Build our program and an empty VAO
    gs.programView = buildProgram((racineProjet+(std::string)"basic.vsl").c_str(), (racineProjet+(std::string)"basic.fsl").c_str());


    Mesh m;
    m = ObjManager::loadFromOBJ(Vector3D(0,0,0), (racineProjet+(std::string)"monkey.obj").c_str());

    nbVertex = m.nbface()+6; //nbface + quad "sol"

    float* data = (float*) malloc(nbVertex*4*sizeof(float));
	float* dataNormal = (float*) malloc(nbVertex * 4 * sizeof(float));

    std::vector<Vector3D> vertex = m.getvertex();
    std::vector<int> face = m.getface();
    std::vector<Vector3D> normals = m.getNormals();
    std::vector<int> normalIds = m.getNormalIds();

    int i=0;
    for(int j=0; j<face.size(); j++){
        //set vertex
        data[i] = vertex[face[j]].x;
        data[i+1] = vertex[face[j]].y;
        data[i+2] = vertex[face[j]].z;
        data[i+3] = 1;



        dataNormal[i] = normals[normalIds[j]].x;
        dataNormal[i+1] = normals[normalIds[j]].y;
        dataNormal[i+2] = normals[normalIds[j]].z;
        dataNormal[i+3] = 1;

        i+=4;
    }

    //ajout du quad pour faire le sol
    ajoutSol(Vector3D(-15,-1,-15), Vector3D(15,-1,-15), Vector3D(15,-1,15), Vector3D(-15,-1,15), Vector3D(0,1,0),
                  nbVertex*4, data, dataNormal);



	GLuint buffer;
    glGenBuffers(1, &buffer);
    glBindBuffer(GL_ARRAY_BUFFER, buffer);
    glBufferData(GL_ARRAY_BUFFER, nbVertex*4*4, data, GL_STATIC_DRAW);

    GLuint buffer2;
    glGenBuffers(1, &buffer2);
    glBindBuffer(GL_ARRAY_BUFFER, buffer2);
    glBufferData(GL_ARRAY_BUFFER, nbVertex*4*4, dataNormal, GL_STATIC_READ);

	glCreateVertexArrays(1, &gs.vao);

	glBindVertexArray(gs.vao);

        glBindBuffer(GL_ARRAY_BUFFER, buffer);
        glVertexAttribPointer(12, 4, GL_FLOAT, GL_FALSE, 0, 0);
		glEnableVertexAttribArray(12);

        glBindBuffer(GL_ARRAY_BUFFER, buffer2);
        glVertexAttribPointer(13, 4, GL_FLOAT, GL_FALSE, 0, 0);
        glEnableVertexAttribArray(13);

    glBindVertexArray(0);

    glEnable(GL_DEPTH_TEST);

    // create the depth texture
    glGenTextures(1, &gs.depthTexture);
    glBindTexture(GL_TEXTURE_2D, gs.depthTexture);
	glTexStorage2D(GL_TEXTURE_2D, 1, GL_DEPTH_COMPONENT32F, 640, 480);

    // Framebuffer
    glGenFramebuffers(1, &gs.fbo);
    glBindFramebuffer(GL_FRAMEBUFFER, gs.fbo);
    glFramebufferTexture(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, gs.depthTexture, 0);    

    assert(glCheckFramebufferStatus(GL_FRAMEBUFFER) == GL_FRAMEBUFFER_COMPLETE);
    glBindFramebuffer(GL_FRAMEBUFFER, 0);

	glActiveTexture(GL_TEXTURE0);
	glBindTexture(GL_TEXTURE_2D, gs.depthTexture);

	glBindVertexArray(0);
	free(data); free(dataNormal);
}
Ejemplo n.º 5
0
/**
 * Runs the three-kernel feature detection pipeline ("locate_features",
 * "non_max_counts", "get_features") on `in` and returns the detected features.
 *
 * @param out_feat       receives the number of features written to the outputs
 * @param x_out          output; on return dims[0] == *out_feat, and data is
 *                       allocated only when at least one feature was found
 * @param y_out          output, same layout as x_out
 * @param score_out      output, same layout as x_out
 * @param in             input image
 * @param thr            threshold forwarded to the locate_features kernel
 * @param feature_ratio  caps the feature count at
 *                       ceil(dims[0] * dims[1] * feature_ratio)
 * @param edge           border width forwarded to all three kernels; also
 *                       shrinks the locate_features launch grid
 *
 * OpenCL errors are converted with CL_TO_AF_ERROR.
 */
void fast(unsigned* out_feat,
          Param &x_out,
          Param &y_out,
          Param &score_out,
          Param in,
          const float thr,
          const float feature_ratio,
          const unsigned edge)
{
    try {
        // One compile flag per device; the maps hold the program and the three
        // kernels built for each device id.
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> fastProgs;
        static std::map<int, Kernel*>  lfKernel;
        static std::map<int, Kernel*>  nmKernel;
        static std::map<int, Kernel*>  gfKernel;

        int device = getActiveDeviceId();

        // Build the program and extract its kernels the first time this
        // device is used.
        std::call_once( compileFlags[device], [device] () {

                std::ostringstream options;
                options << " -D T=" << dtype_traits<T>::getName()
                        << " -D ARC_LENGTH=" << arc_length
                        << " -D NONMAX=" << static_cast<unsigned>(nonmax);

                if (std::is_same<T, double>::value ||
                    std::is_same<T, cdouble>::value) {
                    options << " -D USE_DOUBLE";
                }

                cl::Program prog;
                buildProgram(prog, fast_cl, fast_cl_len, options.str());
                fastProgs[device] = new Program(prog);

                lfKernel[device] = new Kernel(*fastProgs[device], "locate_features");
                nmKernel[device] = new Kernel(*fastProgs[device], "non_max_counts");
                gfKernel[device] = new Kernel(*fastProgs[device], "get_features");
            });

        // Upper bound on the number of features that will be returned.
        const unsigned max_feat = ceil(in.info.dims[0] * in.info.dims[1] * feature_ratio);

        // Matrix containing scores for detected features, scores are stored in the
        // same coordinates as features, dimensions should be equal to in.
        cl::Buffer *d_score = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float));
        // Zero the score matrix with a blocking write from a host-side vector.
        std::vector<float> score_init(in.info.dims[0] * in.info.dims[1], (float)0);
        getQueue().enqueueWriteBuffer(*d_score, CL_TRUE, 0, in.info.dims[0] * in.info.dims[1] * sizeof(float), &score_init[0]);

        // d_flags aliases d_score unless nonmax is set, in which case it gets
        // its own buffer (and is freed separately at the end).
        cl::Buffer *d_flags = d_score;
        if (nonmax) {
            d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T));
        }

        // The locate_features grid covers the image minus `edge` on each side.
        const int blk_x = divup(in.info.dims[0]-edge*2, FAST_THREADS_X);
        const int blk_y = divup(in.info.dims[1]-edge*2, FAST_THREADS_Y);

        // Locate features kernel sizes
        const NDRange local(FAST_THREADS_X, FAST_THREADS_Y);
        const NDRange global(blk_x * FAST_THREADS_X, blk_y * FAST_THREADS_Y);

        auto lfOp = make_kernel<Buffer, KParam,
                                Buffer, const float, const unsigned,
                                LocalSpaceArg> (*lfKernel[device]);

        // Local memory holds one work-group tile plus 6 extra elements in each
        // dimension.
        lfOp(EnqueueArgs(getQueue(), global, local),
             *in.data, in.info, *d_score, thr, edge,
             cl::Local((FAST_THREADS_X + 6) * (FAST_THREADS_Y + 6) * sizeof(T)));
        CL_DEBUG_FINISH(getQueue());

        // One work-group per 64x64 region of the image for the remaining kernels.
        const int blk_nonmax_x = divup(in.info.dims[0], 64);
        const int blk_nonmax_y = divup(in.info.dims[1], 64);

        // Nonmax kernel sizes
        const NDRange local_nonmax(FAST_THREADS_NONMAX_X, FAST_THREADS_NONMAX_Y);
        const NDRange global_nonmax(blk_nonmax_x * FAST_THREADS_NONMAX_X, blk_nonmax_y * FAST_THREADS_NONMAX_Y);

        // Device-side feature counter, initialised to zero.
        unsigned count_init = 0;
        cl::Buffer *d_total = bufferAlloc(sizeof(unsigned));
        getQueue().enqueueWriteBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &count_init);

        // One unsigned per work-item of the nonmax grid, for counts and offsets.
        //size_t *global_nonmax_dims = global_nonmax();
        size_t blocks_sz = blk_nonmax_x * FAST_THREADS_NONMAX_X * blk_nonmax_y * FAST_THREADS_NONMAX_Y * sizeof(unsigned);
        cl::Buffer *d_counts  = bufferAlloc(blocks_sz);
        cl::Buffer *d_offsets = bufferAlloc(blocks_sz);

        auto nmOp = make_kernel<Buffer, Buffer, Buffer,
                                Buffer, Buffer,
                                KParam, const unsigned> (*nmKernel[device]);
        nmOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
                         *d_counts, *d_offsets, *d_total, *d_flags, *d_score, in.info, edge);
        CL_DEBUG_FINISH(getQueue());

        // Blocking read of the feature count, then clamp it to max_feat.
        unsigned total;
        getQueue().enqueueReadBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &total);
        total = total < max_feat ? total : max_feat;

        // Output buffers are only allocated (and get_features only runs) when
        // there is something to return.
        if (total > 0) {
            size_t out_sz = total * sizeof(float);
            x_out.data = bufferAlloc(out_sz);
            y_out.data = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);

            auto gfOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, Buffer,
                                    KParam, const unsigned,
                                    const unsigned> (*gfKernel[device]);
            gfOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
                             *x_out.data, *y_out.data, *score_out.data,
                             *d_flags, *d_counts, *d_offsets,
                             in.info, total, edge);
            CL_DEBUG_FINISH(getQueue());
        }

        *out_feat = total;

        // Describe the outputs as contiguous 1-D arrays of `total` elements.
        x_out.info.dims[0] = total;
        x_out.info.strides[0] = 1;
        y_out.info.dims[0] = total;
        y_out.info.strides[0] = 1;
        score_out.info.dims[0] = total;
        score_out.info.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k] = 1;
            x_out.info.strides[k] = total;
            y_out.info.dims[k] = 1;
            y_out.info.strides[k] = total;
            score_out.info.dims[k] = 1;
            score_out.info.strides[k] = total;
        }

        // Release scratch buffers; d_flags is only a separate allocation when
        // nonmax is set.
        // NOTE(review): these frees are skipped if anything above throws, so
        // the scratch buffers appear to leak on the error path — confirm
        // whether the allocator reclaims them elsewhere.
        bufferFree(d_score);
        if (nonmax) bufferFree(d_flags);
        bufferFree(d_total);
        bufferFree(d_counts);
        bufferFree(d_offsets);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Ejemplo n.º 6
0
        /**
         * Compiles (once per device) and enqueues the "approx1_kernel" OpenCL
         * kernel.
         *
         * @param out      output array; its dimensions determine the launch grid
         * @param in       input array
         * @param pos      positions array forwarded to the kernel
         * @param offGrid  scalar forwarded to the kernel unchanged
         *
         * OpenCL errors are converted with CL_TO_AF_ERROR.
         */
        void approx1(Param out, const Param in, const Param pos, const float offGrid)
        {
            try {
                static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
                static std::map<int, Program*>  approxProgs;
                static std::map<int, Kernel*> approxKernels;

                const int device = getActiveDeviceId();

                // First use on this device: assemble the build options and
                // compile the program.
                std::call_once(compileFlags[device], [device] () {
                    ToNum<Ty> toNum;

                    std::ostringstream options;
                    options << " -D Ty=" << dtype_traits<Ty>::getName();
                    options << " -D Tp=" << dtype_traits<Tp>::getName();
                    options << " -D ZERO=" << toNum(scalar<Ty>(0));

                    // Complex element types need the CPLX code path.
                    const bool isComplex =
                        ((af_dtype) dtype_traits<Ty>::af_type == c32) ||
                        ((af_dtype) dtype_traits<Ty>::af_type == c64);
                    options << (isComplex ? " -D CPLX=1" : " -D CPLX=0");

                    const bool isDouble = std::is_same<Ty, double>::value ||
                                          std::is_same<Ty, cdouble>::value;
                    if (isDouble) {
                        options << " -D USE_DOUBLE";
                    }

                    // Only the two interpolation modes below add a define;
                    // any other value leaves INTERP undefined.
                    if (method == AF_INTERP_NEAREST) {
                        options << " -D INTERP=NEAREST";
                    } else if (method == AF_INTERP_LINEAR) {
                        options << " -D INTERP=LINEAR";
                    }

                    Program prog;
                    buildProgram(prog, approx1_cl, approx1_cl_len, options.str());

                    Program *cachedProg = new Program(prog);
                    approxProgs[device] = cachedProg;
                    approxKernels[device] = new Kernel(*cachedProg, "approx1_kernel");
                });

                auto approx1Op = make_kernel<Buffer, const KParam, const Buffer, const KParam,
                                       const Buffer, const KParam, const float, const int>
                                      (*approxKernels[device]);

                // Work-groups are THREADS wide; dims[1] is tiled along x and
                // dims[2] * dims[3] along y.
                NDRange local(THREADS, 1, 1);
                int blocksPerMat = divup(out.info.dims[0], local[0]);

                NDRange global(blocksPerMat * local[0] * out.info.dims[1],
                               out.info.dims[2] * out.info.dims[3] * local[0],
                               1);

                approx1Op(EnqueueArgs(getQueue(), global, local),
                          *out.data, out.info,
                          *in.data, in.info,
                          *pos.data, pos.info,
                          offGrid, blocksPerMat);

                CL_DEBUG_FINISH(getQueue());
            } catch (cl::Error err) {
                CL_TO_AF_ERROR(err);
                throw;
            }
        }
Ejemplo n.º 7
0
/**
 * Compiles (once per configuration and device) and enqueues the "csrmm_nt"
 * OpenCL kernel.
 *
 * The kernel is specialised on the element type and on whether alpha differs
 * from 1 and beta differs from 0.
 *
 * @param out     output array
 * @param values  non-zero values of the CSR matrix
 * @param rowIdx  CSR row index array; M is taken as dims[0] - 1
 * @param colIdx  CSR column index array
 * @param rhs     right-hand-side array; N is taken as dims[0]
 * @param alpha   scalar forwarded to the kernel
 * @param beta    scalar forwarded to the kernel
 */
void csrmm_nt(Param out, const Param &values, const Param &rowIdx,
              const Param &colIdx, const Param &rhs, const T alpha,
              const T beta) {
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    std::string ref_name = std::string("csrmm_nt_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(use_alpha) +
                           std::string("_") + std::to_string(use_beta) +
                           std::string("_") + std::to_string(use_greedy);

    int device = getActiveDeviceId();

    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmm_cl};
        const int ker_lens[]   = {csrmm_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());
        entry.prog   = new Program(prog);
        // Two slots are reserved; both currently hold the same kernel.
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, ref_name, entry);
    }

    auto csrmm_nt_kernel = entry.ker[0];
    auto csrmm_nt_func =
        KernelFunctor<Buffer, Buffer, Buffer, Buffer, int, int, Buffer, KParam,
                      T, T, Buffer>(csrmm_nt_kernel);
    NDRange local(THREADS_PER_GROUP, 1);
    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y     = std::min(groups_y, MAX_CSRMM_GROUPS);
    NDRange global(local[0] * groups_x, local[1] * groups_y);

    // One zero-initialised int per x work-group, uploaded as the kernel's
    // counter buffer.
    std::vector<int> count(groups_x);
    cl::Buffer *counter = bufferAlloc(count.size() * sizeof(int));

    // The upload and the launch can both throw; make sure the scratch buffer
    // goes back to the allocator in that case instead of leaking.
    try {
        getQueue().enqueueWriteBuffer(
            *counter, CL_TRUE, 0, count.size() * sizeof(int), count.data());

        csrmm_nt_func(EnqueueArgs(getQueue(), global, local), *out.data,
                      *values.data, *rowIdx.data, *colIdx.data, M, N, *rhs.data,
                      rhs.info, alpha, beta, *counter);
    } catch (...) {
        bufferFree(counter);
        throw;
    }

    bufferFree(counter);
}
Ejemplo n.º 8
0
void conv2Helper(const conv_kparam_t& param, Param out, const Param signal, const Param filter)
{
    try {
        int f0 = filter.info.dims[0];
        int f1 = filter.info.dims[1];

        std::string ref_name =
            std::string("conv2_") +
            std::string(dtype_traits<T>::getName()) +
            std::string("_") +
            std::string(dtype_traits<aT>::getName()) +
            std::string("_") +
            std::to_string(expand) +
            std::string("_") +
            std::to_string(f0) +
            std::string("_") +
            std::to_string(f1);

        int device = getActiveDeviceId();
        kc_t::iterator idx = kernelCaches[device].find(ref_name);

        kc_entry_t entry;
        if (idx == kernelCaches[device].end()) {
            size_t LOC_SIZE = (THREADS_X+2*(f0-1))*(THREADS_Y+2*(f1-1));

            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D accType="<< dtype_traits<aT>::getName()
                    << " -D BASE_DIM="<< 2 /* hard constant specific to this convolution type */
                    << " -D FLEN0=" << f0
                    << " -D FLEN1=" << f1
                    << " -D EXPAND="<< expand
                    << " -D C_SIZE="<< LOC_SIZE;
            if (std::is_same<T, double>::value ||
                std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }
            Program prog;
            buildProgram(prog, convolve_cl, convolve_cl_len, options.str());
            entry.prog   = new Program(prog);
            entry.ker = new Kernel(*entry.prog, "convolve");

            kernelCaches[device][ref_name] = entry;
        } else {
            entry = idx->second;
        }

        auto convOp = cl::KernelFunctor<Buffer, KParam, Buffer, KParam,
                                  Buffer, KParam, int, int,
                                  int, int,
                                  int, int
                                 >(*entry.ker);

        convOp(EnqueueArgs(getQueue(), param.global, param.local),
                *out.data, out.info, *signal.data, signal.info,
                *param.impulse, filter.info, param.nBBS0, param.nBBS1,
                param.o[1], param.o[2], param.s[1], param.s[2]);

    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
Ejemplo n.º 9
0
// setup
int main(int argc, char *argv[])
{
    GLenum err = 0;
    /*********************************************
     * GLFW SETUP
     *********************************************/
    err = glfwInit();
    if (!err)
    {
        fputs("Failed to load the GLFW library", stderr);
        exit(EXIT_FAILURE);
    }

    /*********************************************
     * STATE SETUP (initialize gl context)
     *********************************************/
    // must be setup before glew so that a valid openGL
    // context exists (created with the window)

    w_state = new WorldState();
    c_state.init(*w_state);

    /*********************************************
     * GLEW SETUP
     *********************************************/
    err = glewInit();
    if (err != GLEW_OK)
    {
        fputs("Failed to initialize the GLEW library", stderr);
        exit(EXIT_FAILURE);
    }

    /*********************************************
     * STATE SETUP (construct render states)
     *********************************************/
    // must be setup after glew so that GL array
    // objects exist

    r_state[0] = new RenderState(3);
    r_state[1] = new RenderState(3);

    /*********************************************
     * SHADER SETUP
     *********************************************/
    // read default shaders from file
    GLuint shaderProgram[2] = {0};
    GLuint shaders[2] = {0};

    buildShader(GL_VERTEX_SHADER, "default.vs.glsl", shaders[0]);
    buildShader(GL_FRAGMENT_SHADER, "default.fs.glsl", shaders[1]);

    // create default shader program
    shaderProgram[0] = buildProgram(2, shaders);

    // bind shader program
    w_state->setProgram(0, shaderProgram[0]);
    w_state->useProgram(0);

    // setup the transform matrices and uniform variables
    w_state->loadTransforms();
    w_state->loadLights();
    w_state->loadMaterials();

    /*********************************************
     * LOAD MESH
     *********************************************/
    
    g_mesh = loadMeshFromFile(*r_state[0], "Mesh/arma.obj");

    /*********************************************
     * SET GL STATE
     *********************************************/ 
    glEnable(GL_DEPTH_TEST);

    /*********************************************
     * RENDER LOOP
     *********************************************/
    glfwSetTime(0.0);
    while (!glfwWindowShouldClose(c_state.window))
        display();

    /*********************************************
     * CLEAN UP
     *********************************************/
    delete g_mesh;
    glfwTerminate();

    exit(EXIT_SUCCESS);
}