Пример #1
0
// Host-side wrapper for the "cuda::transpose" kernel: fetches the kernel
// specialised on T, `conjugate` and `is32multiple`, derives the grid from
// the dimensions of `in`, and enqueues it on the active stream.
//
//   out          - forwarded to the kernel as its first data argument
//   in           - forwarded to the kernel; in.dims[0..3] size the grid
//   conjugate    - forwarded as a kernel template argument
//   is32multiple - forwarded as a kernel template argument
void transpose(Param<T> out, CParam<T> in, const bool conjugate,
               const bool is32multiple) {
    // Built once from transpose_cuh / transpose_cuh_len and reused by every
    // later call (function-local static).
    static const std::string source(transpose_cuh, transpose_cuh_len);

    // clang-format off
    auto transposeKernel = getKernel(
        "cuda::transpose", source,
        { TemplateTypename<T>(), TemplateArg(conjugate), TemplateArg(is32multiple) },
        { DefineValue(TILE_DIM), DefineValue(THREADS_Y) });
    // clang-format on

    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    // Number of TILE_DIM-wide tiles needed along the first two dimensions.
    int tilesAlong0 = divup(in.dims[0], TILE_DIM);
    int tilesAlong1 = divup(in.dims[1], TILE_DIM);

    // dims[2] and dims[3] are folded into the grid's x and y extents.
    dim3 blocks(tilesAlong0 * in.dims[2], tilesAlong1 * in.dims[3]);

    // If the y extent is larger than the device's maxGridSize[1], spread it
    // over the z extent: z gets the number of slices, y the per-slice count.
    const int maxBlocksY =
        cuda::getDeviceProp(getActiveDeviceId()).maxGridSize[1];
    const unsigned int requestedY = blocks.y;
    blocks.z                      = divup(requestedY, maxBlocksY);
    blocks.y                      = divup(requestedY, blocks.z);

    EnqueueArgs qArgs(blocks, threads, getActiveStream());

    // The per-dimension tile counts are passed along with the arrays.
    transposeKernel(qArgs, out, in, tilesAlong0, tilesAlong1);

    POST_LAUNCH_CHECK();
}
Пример #2
0
/**
 * Emit the definition of a floating-point constant symbol.
 *
 * The symbol is given a fresh name of the form ".flt<N>", where N is the
 * current value of the FloatNum counter (post-incremented here). The
 * function then calls Align(p), prints the name as a "<name>:\t" label, and
 * hands the symbol's type and value to DefineValue().
 *
 * @param p  symbol to define; p->aname is overwritten, and p->ty and p->val
 *           are read.
 */
void DefineFloatConstant(Symbol p)
{
	/* Each constant gets its own name, numbered by FloatNum. */
	p->aname = FormatName(".flt%d", FloatNum++);

	/* Align() is called before the label is printed; presumably it emits
	 * an alignment directive for p -- confirm against its definition. */
	Align(p);
	Print("%s:\t", p->aname);
	DefineValue(p->ty, p->val);
}