예제 #1
0
/**
 * Returns the "key_value_mapping" kernel for the given command queue.
 *
 * The kernel is generated and compiled only when the function-local cache has
 * no entry for backend::cache_key(queue); otherwise the cached kernel is
 * returned as is.
 *
 * Kernel arguments are emitted in this order: n, the "ikeys" pointers and the
 * "okeys" pointers (boost::mpl::for_each applies pointer_param to every type
 * in K), then ovals, offset and ivals.
 */
backend::kernel key_value_mapping(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto id    = backend::cache_key(queue);
    auto entry = cache.find(id);

    // Cache hit: nothing to generate.
    if (entry != cache.end()) return entry->second;

    // Number of key components. The generated code addresses them as
    // ikeys0, ikeys1, ... and okeys0, okeys1, ...
    const int nkeys = boost::mpl::size<K>::value;

    backend::source_generator src(queue);

    // --- Kernel signature ---------------------------------------------------
    src.kernel("key_value_mapping")
        .open("(")
        .template parameter< size_t >("n");

    boost::mpl::for_each<K>(pointer_param<global_ptr, true>(src, "ikeys"));
    boost::mpl::for_each<K>(pointer_param<global_ptr      >(src, "okeys"));

    src.template parameter< global_ptr<V>       >("ovals")
       .template parameter< global_ptr<int>     >("offset")
       .template parameter< global_ptr<const V> >("ivals")
       .close(")").open("{");

    // --- Kernel body ----------------------------------------------------------
    src.new_line().grid_stride_loop().open("{");

    src.new_line() << "int num_sections = offset[n - 1] + 1;";
    src.new_line() << "int off = offset[idx];";

    // Any element but the last whose offset differs from its successor's
    // is copied to slot `off` of the outputs.
    src.new_line() << "if (idx < (n - 1) && off != offset[idx + 1])";
    src.open("{");
    for(int k = 0; k < nkeys; ++k)
        src.new_line() << "okeys" << k << "[off] = ikeys" << k << "[idx];";
    src.new_line() << "ovals[off] = ivals[idx];";
    src.close("}");

    // The last element is always copied, into the final output slot.
    src.new_line() << "if (idx == (n - 1))";
    src.open("{");
    for(int k = 0; k < nkeys; ++k)
        src.new_line() << "okeys" << k << "[num_sections - 1] = ikeys" << k << "[idx];";
    src.new_line() << "ovals[num_sections - 1] = ivals[idx];";
    src.close("}");

    // Close the loop body, then the kernel body.
    src.close("}").close("}");

    backend::kernel built(queue, src.str(), "key_value_mapping");
    entry = cache.insert(std::make_pair(id, built)).first;

    return entry->second;
}
예제 #2
0
/**
 * Returns the "block_sum_by_key" kernel for the given command queue.
 *
 * The kernel is built on the first request for a given
 * backend::cache_key(queue) and served from a function-local cache afterwards.
 *
 * In the generated code, an element of a block other than the first has
 * output[g_id] combined (via `oper`) with post_sum[block - 1] when its key
 * equals key_sum[block - 1] and differs from the key of the next element.
 */
backend::kernel block_sum_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto id    = backend::cache_key(queue);
    auto entry = cache.find(id);

    // Already compiled for this cache key.
    if (entry != cache.end()) return entry->second;

    // Name of the value type as it is spelled in the generated source.
    const auto value_type = type_name<T>();

    backend::source_generator src(queue);

    Oper::define(src, "oper");

    src.kernel("block_sum_by_key").open("(")
        .template parameter< size_t                >("n")
        .template parameter< global_ptr<const int> >("key_sum")
        .template parameter< global_ptr<const T>   >("post_sum")
        .template parameter< global_ptr<const int> >("keys")
        .template parameter< global_ptr<T>         >("output")
        .close(")").open("{");

    src.new_line() << "size_t g_id  = " << src.global_id(0)  << ";";
    src.new_line() << "size_t block = " << src.group_id(0)   << ";";

    src.new_line() << "if (g_id >= n) return;";

    // key1: key from the previous block's key_sum entry, key2: own key,
    // key3: key of the next element. Where there is no neighbour, `key2 - 1`
    // is used so the comparison against key2 cannot succeed.
    src.new_line() << "int key2 = keys[ g_id ];";
    src.new_line() << "int key1 = (block > 0    ) ? key_sum[ block - 1 ] : key2 - 1;";
    src.new_line() << "int key3 = (g_id  < n - 1) ? keys   [ g_id  + 1 ] : key2 - 1;";

    src.new_line() << "if (block > 0 && key1 == key2 && key2 != key3)";
    src.open("{");
    src.new_line() << value_type << " scan_result    = output  [ g_id      ];";
    src.new_line() << value_type << " post_block_sum = post_sum[ block - 1 ];";
    src.new_line() << "output[ g_id ] = oper( scan_result, post_block_sum );";
    src.close("}");

    src.close("}");

    backend::kernel built(queue, src.str(), "block_sum_by_key");
    entry = cache.insert(std::make_pair(id, built)).first;

    return entry->second;
}
예제 #3
0
/**
 * Returns the "offset_calculation" kernel for the given command queue.
 *
 * The kernel is built on the first request for a given
 * backend::cache_key(queue) and served from a function-local cache afterwards.
 *
 * In the generated code offsets[0] is set to 0 and, for idx > 0, offsets[idx]
 * is set to the negated result of `comp` applied to the key components at
 * idx - 1 followed by the key components at idx.
 */
backend::kernel offset_calculation(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto id    = backend::cache_key(queue);
    auto entry = cache.find(id);

    // Already compiled for this cache key.
    if (entry != cache.end()) return entry->second;

    // Number of key components; the generated code names them keys0, keys1, ...
    const int nkeys = boost::mpl::size<T>::value;

    backend::source_generator src(queue);

    Comp::define(src, "comp");

    src.kernel("offset_calculation")
        .open("(")
        .template parameter< size_t >("n");

    boost::mpl::for_each<T>(pointer_param<global_ptr, true>(src, "keys"));

    src.template parameter< global_ptr<int> >("offsets")
       .close(")").open("{");

    src.new_line().grid_stride_loop().open("{");

    src.new_line()
        << "if (idx > 0)"
        << " offsets[idx] = !comp(";

    // First half of the argument list: components of the previous element.
    // The separator is empty for the first component only.
    const char *sep = "";
    for(int k = 0; k < nkeys; ++k) {
        src << sep << "keys" << k << "[idx - 1]";
        sep = ", ";
    }

    // Second half: components of the current element.
    for(int k = 0; k < nkeys; ++k)
        src << ", keys" << k << "[idx]";

    src << ");";

    src.new_line() << "else offsets[idx] = 0;";

    // Close the loop body, then the kernel body.
    src.close("}").close("}");

    backend::kernel built(queue, src.str(), "offset_calculation");
    entry = cache.insert(std::make_pair(id, built)).first;

    return entry->second;
}
예제 #4
0
/**
 * Returns the "block_inclusive_scan_by_key" kernel for the given command queue.
 *
 * The kernel is generated and compiled when the function-local cache has no
 * entry for backend::cache_key(queue); otherwise the cached kernel is returned.
 *
 * Kernel arguments: n, key_sum (keys, read-only), pre_sum (values, read-only),
 * post_sum (result), work_per_thread.
 *
 * The generated code works in three phases:
 *  1. each work item serially accumulates `work_per_thread` consecutive
 *     elements starting at map_id = g_id * work_per_thread, restarting the
 *     accumulation whenever the key changes;
 *  2. the per-work-item totals are scanned through the `shared` struct
 *     (NT keys and NT values);
 *  3. the total of the preceding work item is folded into post_sum wherever
 *     the keys match.
 */
backend::kernel block_inclusive_scan_by_key(const backend::command_queue &queue)
{
    static detail::kernel_cache cache;

    auto cache_key = backend::cache_key(queue);
    auto kernel    = cache.find(cache_key);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        // Emits the binary operation the kernel body calls as `oper`.
        Oper::define(src, "oper");

        src.kernel("block_inclusive_scan_by_key")
            .open("(")
                .template parameter< size_t                >("n")
                .template parameter< global_ptr<const int> >("key_sum")
                .template parameter< global_ptr<const T>   >("pre_sum")
                .template parameter< global_ptr<T>         >("post_sum")
                .template parameter< cl_uint               >("work_per_thread")
            .close(")").open("{");

        src.new_line() << "size_t l_id   = " << src.local_id(0)   << ";";
        src.new_line() << "size_t g_id   = " << src.global_id(0)  << ";";
        src.new_line() << "size_t wgsz   = " << src.local_size(0) << ";";
        // First element handled by this work item.
        src.new_line() << "size_t map_id = g_id * work_per_thread;";

        // Per-work-item slots for the last key seen and the accumulated value.
        src.new_line() << "struct Shared";
        src.open("{");
            src.new_line() << "int keys[" << NT << "];";
            src.new_line() << type_name<T>() << " vals[" << NT << "];";
        src.close("};");

        src.smem_static_var("struct Shared", "shared");

        src.new_line() << "uint offset;";
        src.new_line() << "int  key;";
        src.new_line() << type_name<T>() << " work_sum;";

        // NOTE(review): `key` and `work_sum` are only assigned inside the
        // `map_id < n` branch below but are stored to `shared` afterwards
        // unconditionally -- confirm this is harmless for map_id >= n.
        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "int prev_key;";

        // accumulate zeroth value manually
        src.new_line() << "offset   = 0;";
        src.new_line() << "key      = key_sum[map_id];";
        src.new_line() << "work_sum = pre_sum[map_id];";

        src.new_line() << "post_sum[map_id] = work_sum;";

        //  Serial accumulation
        src.new_line() << "for( offset = offset + 1; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line() << "prev_key = key;";
        // NOTE(review): key_sum is read before the `map_id + offset < n`
        // check that follows -- verify key_sum is large enough for this read.
        src.new_line() << "key      = key_sum[ map_id + offset ];";

        src.new_line() << "if ( map_id + offset < n )";
        src.open("{");
        src.new_line() << type_name<T>() << " y = pre_sum[ map_id + offset ];";

        // Continue the running value within a key, restart it on a key change.
        src.new_line() << "if ( key == prev_key ) work_sum = oper( work_sum, y );";
        src.new_line() << "else work_sum = y;";

        src.new_line() << "post_sum[ map_id + offset ] = work_sum;";
        src.close("}");
        src.close("}");
        src.close("}");
        src.new_line().barrier();

        // load LDS with register sums
        src.new_line() << "shared.vals[ l_id ] = work_sum;";
        src.new_line() << "shared.keys[ l_id ] = key;";

        // scan in lds
        src.new_line() << type_name<T>() << " scan_sum = work_sum;";

        // The stride doubles each iteration; a barrier is emitted before the
        // reads of `shared` and another before the write-back.
        src.new_line() << "for( offset = 1; offset < wgsz; offset *= 2 )";
        src.open("{");
        src.new_line().barrier();

        src.new_line() << "if (map_id < n)";
        src.open("{");
        src.new_line() << "if (l_id >= offset)";
        src.open("{");
        src.new_line() << "int key1 = shared.keys[ l_id ];";
        src.new_line() << "int key2 = shared.keys[ l_id - offset ];";

        src.new_line() << "if ( key1 == key2 ) scan_sum = oper( scan_sum, shared.vals[ l_id - offset ] );";
        src.new_line() << "else scan_sum = shared.vals[ l_id ];";
        src.close("}");

        src.close("}");
        src.new_line().barrier();

        src.new_line() << "shared.vals[ l_id ] = scan_sum;";
        src.close("}");

        src.new_line().barrier();

        // write final scan from pre-scan and lds scan
        src.new_line() << "for( offset = 0; offset < work_per_thread; ++offset )";
        src.open("{");
        // NOTE(review): the meaning of the `true` argument to barrier() is
        // not visible in this chunk -- check source_generator::barrier.
        src.new_line().barrier(true);

        // NOTE(review): post_sum / key_sum are accessed at map_id + offset
        // with only `map_id < n` checked -- confirm the buffers cover
        // map_id + work_per_thread - 1.
        src.new_line() << "if (map_id < n && l_id > 0)";
        src.open("{");
        src.new_line() << type_name<T>() << " y = post_sum[ map_id + offset ];";
        src.new_line() << "int key1 = key_sum    [ map_id + offset ];";
        src.new_line() << "int key2 = shared.keys[ l_id - 1 ];";

        // Fold in the preceding work item's total only when the keys match.
        src.new_line() << "if ( key1 == key2 ) y = oper( y, shared.vals[l_id - 1] );";

        src.new_line() << "post_sum[ map_id + offset ] = y;";
        src.close("}");
        src.close("}");

        src.close("}");

        backend::kernel krn(queue, src.str(), "block_inclusive_scan_by_key");
        kernel = cache.insert(std::make_pair(cache_key, krn)).first;
    }

    return kernel->second;
}
예제 #5
0
/**
 * Returns the "block_scan_by_key" kernel for the given command queue.
 *
 * The kernel is built on the first request for a given
 * backend::cache_key(queue) and served from a function-local cache afterwards.
 *
 * Kernel arguments: n, keys, vals (inputs), output, key_buf and val_buf
 * (written once per work group by the work item with l_id == 0).
 */
backend::kernel block_scan_by_key(const backend::command_queue &queue) {
    static detail::kernel_cache cache;

    auto id    = backend::cache_key(queue);
    auto entry = cache.find(id);

    // Already compiled for this cache key.
    if (entry != cache.end()) return entry->second;

    // Name of the value type as it is spelled in the generated source.
    const auto value_type = type_name<T>();

    backend::source_generator src(queue);

    Oper::define(src, "oper");

    src.kernel("block_scan_by_key").open("(")
        .template parameter< size_t                >("n")
        .template parameter< global_ptr<const int> >("keys")
        .template parameter< global_ptr<const T>   >("vals")
        .template parameter< global_ptr<T>         >("output")
        .template parameter< global_ptr<int>       >("key_buf")
        .template parameter< global_ptr<T>         >("val_buf")
        .close(")").open("{");

    src.new_line() << "size_t l_id  = " << src.local_id(0)   << ";";
    src.new_line() << "size_t g_id  = " << src.global_id(0)  << ";";
    src.new_line() << "size_t block = " << src.group_id(0)   << ";";
    src.new_line() << "size_t wgsz  = " << src.local_size(0) << ";";

    // Scratch storage: NT keys and NT values.
    src.new_line() << "struct Shared";
    src.open("{");
    src.new_line() << "int keys[" << NT << "];";
    src.new_line() << value_type << " vals[" << NT << "];";
    src.close("};");

    src.smem_static_var("struct Shared", "shared");

    src.new_line() << "int key;";
    src.new_line() << value_type << " val;";

    // In-range work items load their element and publish it to `shared`.
    src.new_line() << "if (g_id < n)";
    src.open("{");
    src.new_line() << "key = keys[g_id];";
    src.new_line() << "val = vals[g_id];";
    src.new_line() << "shared.keys[l_id] = key;";
    src.new_line() << "shared.vals[l_id] = val;";
    src.close("}");

    // Scan inside the work group: shared.vals is updated every round,
    // shared.keys is left untouched.
    src.new_line() << value_type << " sum = val;";
    src.new_line() << "for(size_t offset = 1; offset < wgsz; offset *= 2)";
    src.open("{");
    src.new_line().barrier();
    src.new_line() << "if (l_id >= offset && shared.keys[l_id - offset] == key)";
    src.open("{");
    src.new_line() << "sum = oper(sum, shared.vals[l_id - offset]);";
    src.close("}");
    src.new_line().barrier();
    src.new_line() << "shared.vals[l_id] = sum;";
    src.close("}");
    src.new_line().barrier();

    src.new_line() << "if (g_id >= n) return;";

    // A work item stores its result only when the next element has a
    // different key (key2 stays -1 for the very last element).
    src.new_line() << "int key2 = -1;";
    src.new_line() << "if (g_id < n - 1) key2 = keys[g_id + 1];";
    src.new_line() << "if (key != key2) output[g_id] = sum;";

    // The first work item of the group exports the group's last shared slot.
    src.new_line() << "if (l_id == 0)";
    src.open("{");
    src.new_line() << "key_buf[block] = shared.keys[wgsz - 1];";
    src.new_line() << "val_buf[block] = shared.vals[wgsz - 1];";
    src.close("}");

    src.close("}");

    backend::kernel built(queue, src.str(), "block_scan_by_key");
    entry = cache.insert(std::make_pair(id, built)).first;

    return entry->second;
}
예제 #6
0
파일: scan.hpp 프로젝트: DingKe/vexcl
/**
 * Returns the "block_inclusive_scan" kernel for the given command queue.
 *
 * The kernel is built on the first request for a queue and served from a
 * function-local cache afterwards.
 *
 * In the generated code each work group loads up to 2 * NT input elements
 * into `shared`, combines them pairwise with `oper` over successive rounds,
 * and its work item with l_id == 0 finally stores shared[2 * NT - 1] to
 * scan_buf1[block] and shared[NT - 1] to scan_buf2[block].
 */
backend::kernel block_inclusive_scan(const backend::command_queue &queue)
{
    static detail::kernel_cache cache;

    auto entry = cache.find(queue);

    // Already compiled for this queue.
    if (entry != cache.end()) return entry->second;

    // Number of elements handled by one work group.
    const auto block_elems = 2 * NT;
    // Name of the value type as it is spelled in the generated source.
    const auto value_type  = type_name<T>();

    backend::source_generator src(queue);

    Oper::define(src, "oper");

    src.kernel("block_inclusive_scan").open("(")
        .template parameter< size_t              >("n")
        .template parameter< global_ptr<const T> >("input")
        .template parameter< T                   >("identity")
        .template parameter< global_ptr<T>       >("scan_buf1")
        .template parameter< global_ptr<T>       >("scan_buf2")
        .template parameter< int                 >("exclusive")
        .close(")").open("{");

    src.new_line() << "size_t l_id  = " << src.local_id(0)   << ";";
    src.new_line() << "size_t g_id  = " << src.global_id(0)  << ";";
    src.new_line() << "size_t block = " << src.group_id(0)   << ";";

    src.new_line() << "size_t offset = 1;";

    // Declaration of the scratch array, e.g. "shared[<2*NT>]".
    {
        std::ostringstream decl;
        decl << "shared[" << block_elems << "]";
        src.smem_static_var(value_type, decl.str());
    }

    // Each work item loads two elements, NT apart, when they are in range.
    src.new_line()
        << "if(block * " << block_elems << " + l_id < n)"
        << " shared[l_id] = input[block * " << block_elems << " + l_id];";

    src.new_line()
        << "if(block * " << block_elems << " + l_id + " << NT << " < n)"
        << " shared[l_id + " << NT << "] ="
        << " input[block * " << block_elems << " + l_id + " << NT << "];";

    // Exclusive mode: the very first element is combined with the identity.
    src.new_line()
        << "if(exclusive && g_id == 0)"
        << " shared[l_id] = oper(identity, input[0]);";

    // Pairwise combination: the number of active work items halves while
    // the stride doubles every round.
    src.new_line() << "for (size_t start = " << NT << "; start > 0; start >>= 1, offset *= 2)";
    src.open("{");
    src.new_line().barrier();

    src.new_line() << "if (l_id < start)";
    src.open("{");
    src.new_line() << "size_t temp1 = offset * (2 * l_id + 1) - 1;";
    src.new_line() << "size_t temp2 = offset * (2 * l_id + 2) - 1;";
    src.new_line() << value_type << " y2 = shared[temp2];";
    src.new_line() << value_type << " y1 = shared[temp1];";
    src.new_line() << "shared[temp2] = oper(y2, y1);";
    src.close("}");

    src.close("}");
    src.new_line().barrier();

    // One work item per group exports the two per-block results.
    src.new_line() << "if (l_id == 0)";
    src.open("{");
    src.new_line() << "scan_buf1[ block ] = shared[" << block_elems - 1 << "];";
    src.new_line() << "scan_buf2[ block ] = shared[" << NT - 1 << "];";
    src.close("}");
    src.close("}");

    entry = cache.insert(queue, backend::kernel(
                queue, src.str(), "block_inclusive_scan"));

    return entry->second;
}
예제 #7
0
파일: scan.hpp 프로젝트: DingKe/vexcl
/**
 * Returns the "block_addition" kernel for the given command queue.
 *
 * The kernel is built on the first request for a queue and served from a
 * function-local cache afterwards.
 *
 * In the generated code every work item loads one input element (shifted by
 * one position, with `identity` in front, when `exclusive` is set). The first
 * work item of each block other than block 0 folds in a carry taken from
 * post_sum / pre_sum. A scan over `shared` then produces output[g_id].
 */
backend::kernel block_addition(
        const backend::command_queue &queue)
{
    static detail::kernel_cache cache;

    auto entry = cache.find(queue);

    // Already compiled for this queue.
    if (entry != cache.end()) return entry->second;

    // Name of the value type as it is spelled in the generated source.
    const auto value_type = type_name<T>();

    backend::source_generator src(queue);

    Oper::define(src, "oper");

    src.kernel("block_addition").open("(")
        .template parameter< size_t              >("n")
        .template parameter< global_ptr<const T> >("input")
        .template parameter< global_ptr<T>       >("output")
        .template parameter< global_ptr<T>       >("post_sum")
        .template parameter< global_ptr<T>       >("pre_sum")
        .template parameter< T                   >("identity")
        .template parameter< int                 >("exclusive")
        .close(")").open("{");

    src.new_line() << "size_t l_id  = " << src.local_id(0)   << ";";
    src.new_line() << "size_t g_id  = " << src.global_id(0)  << ";";
    src.new_line() << "size_t block = " << src.group_id(0)   << ";";

    src.new_line() << value_type << " val;";

    // Declaration of the scratch array, e.g. "shared[<NT>]".
    {
        std::ostringstream decl;
        decl << "shared[" << NT << "]";
        src.smem_static_var(value_type, decl.str());
    }

    // Load this work item's element.
    src.new_line() << "if (g_id < n)";
    src.open("{");
    src.new_line() << "if (exclusive) val = g_id > 0 ? input[g_id - 1] : identity;";
    src.new_line() << "else val = input[g_id];";
    src.close("}");
    src.new_line() << "shared[l_id] = val;";

    src.new_line() << value_type << " scan_result = val;";
    src.new_line() << value_type << " post_block_sum, new_result;";
    src.new_line() << value_type << " y1, y2, sum;";

    // The first work item of a block applies the carry from earlier blocks.
    src.new_line() << "if(l_id == 0 && g_id < n)";
    src.open("{");
    src.new_line() << "if(block > 0)";
    src.open("{");
    src.new_line() << "if(block % 2 == 0)  post_block_sum = post_sum[ block/2 - 1 ];";
    src.new_line() << "else if(block == 1) post_block_sum = pre_sum[0];";
    src.new_line() << "else";
    src.open("{");
    src.new_line() << "y1 = post_sum[ block/2 - 1 ];";
    src.new_line() << "y2 = pre_sum [ block/2];";
    src.new_line() << "post_block_sum = oper(y1, y2);";
    src.close("}");
    src.new_line() << "new_result = exclusive ? post_block_sum : oper( scan_result, post_block_sum );";
    src.close("}");
    src.new_line() << "else new_result = scan_result;";

    src.new_line() << "shared[ l_id ] = new_result;";
    src.close("}");

    // Scan within the work group: the stride doubles each round, with a
    // barrier before the read and another before the write-back.
    src.new_line() << "sum = shared[ l_id ];";
    src.new_line() << "for( size_t offset = 1; offset < " << NT << "; offset *= 2 )";
    src.open("{");
    src.new_line().barrier();
    src.new_line() << "if (l_id >= offset) sum = oper( sum, shared[ l_id - offset ] );";
    src.new_line().barrier();
    src.new_line() << "shared[ l_id ] = sum;";
    src.close("}");
    src.new_line().barrier();
    src.new_line() << "if(g_id < n) output[ g_id ] = sum;";

    src.close("}");

    entry = cache.insert(queue, backend::kernel(
                queue, src.str(), "block_addition"));

    return entry->second;
}
예제 #8
0
파일: scan.hpp 프로젝트: DingKe/vexcl
/**
 * Returns the "intra_block_inclusive_scan" kernel for the given command queue.
 *
 * The kernel is generated and compiled on the first request for a queue and
 * served from a function-local cache afterwards.
 *
 * Kernel arguments: n, post_sum (result), pre_sum (input), identity,
 * work_per_thread.
 *
 * The generated code works in three phases:
 *  1. each work item reduces `work_per_thread` consecutive elements of
 *     pre_sum starting at map_id = g_id * work_per_thread;
 *  2. the per-work-item totals are scanned through the `shared` array
 *     (NT entries);
 *  3. each work item rewrites its elements into post_sum, seeded with the
 *     total of the preceding work item.
 *
 * NOTE(review): in phase 3 pre_sum[map_id] is read and post_sum is written
 * without a `map_id < n` guard, and the `else` branch of the final loop also
 * runs for map_id >= n. Confirm callers size the buffers to cover
 * NT * work_per_thread elements; the emitted code is left unchanged here.
 */
backend::kernel intra_block_inclusive_scan(const backend::command_queue &queue)
{
    static detail::kernel_cache cache;

    auto kernel = cache.find(queue);

    if (kernel == cache.end()) {
        backend::source_generator src(queue);

        // Emits the binary operation the kernel body calls as `oper`.
        Oper::define(src, "oper");

        src.kernel("intra_block_inclusive_scan")
            .open("(")
                .template parameter< size_t              >("n")
                .template parameter< global_ptr<T>       >("post_sum")
                .template parameter< global_ptr<const T> >("pre_sum")
                .template parameter< T                   >("identity")
                // Spelled `unsigned int` rather than `uint`: the latter is a
                // non-standard typedef that not every host toolchain
                // declares, while naming the same type where it exists.
                .template parameter< unsigned int        >("work_per_thread")
            .close(")").open("{");

        src.new_line() << "size_t l_id   = " << src.local_id(0)   << ";";
        src.new_line() << "size_t g_id   = " << src.global_id(0)  << ";";
        // First element handled by this work item.
        src.new_line() << "size_t map_id = g_id * work_per_thread;";

        // Declaration of the scratch array, e.g. "shared[<NT>]".
        {
            std::ostringstream shared;
            shared << "shared[" << NT << "]";
            src.smem_static_var(type_name<T>(), shared.str());
        }

        src.new_line() << "size_t offset;";
        src.new_line() << type_name<T>() << " work_sum;";

        src.new_line() << "if (map_id < n)";
        src.open("{");

        // accumulate zeroth value manually
        src.new_line() << "offset = 0;";
        src.new_line() << "work_sum = pre_sum[map_id];";

        //  Serial accumulation
        src.new_line() << "for( offset = 1; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line()
            << "if (map_id + offset < n)"
            << " work_sum = oper( work_sum, pre_sum[map_id + offset] );";
        src.close("}");
        src.close("}");
        src.new_line().barrier();

        src.new_line() << type_name<T>() << " scan_sum = work_sum;";
        src.new_line() << "shared[ l_id ] = work_sum;";

        // scan in shared: the stride doubles each round, with a barrier
        // before the read and another before the write-back
        src.new_line() << "for( offset = 1; offset < " << NT << "; offset *= 2 )";
        src.open("{");
        src.new_line().barrier();

        src.new_line()
            << "if (map_id < n && l_id >= offset)"
            << " scan_sum = oper( scan_sum, shared[ l_id - offset ] );";
        src.new_line().barrier();
        src.new_line() << "shared[ l_id ] = scan_sum;";
        src.close("}");
        src.new_line().barrier();

        // write final scan from pre-scan and shared scan
        src.new_line() << "work_sum = pre_sum[map_id];";
        src.new_line() << "if (l_id > 0)";
        src.open("{");
        // Seed with the scanned total of the preceding work item.
        src.new_line() << "    work_sum = oper(work_sum, shared[l_id - 1]);";
        src.new_line() << "    post_sum[map_id] = work_sum;";
        src.close("}");
        src.new_line() << "else post_sum[map_id] = work_sum;";

        // Remaining elements of this work item: running combination of
        // pre_sum with the value carried in work_sum.
        src.new_line() << "for( offset = 1; offset < work_per_thread; ++offset )";
        src.open("{");
        src.new_line().barrier();

        src.new_line() << "if (map_id < n && l_id > 0)";
        src.open("{");
        src.new_line() << type_name<T>() << " y = oper(pre_sum[map_id + offset], work_sum);";
        src.new_line() << "post_sum[ map_id + offset ] = y;";
        src.new_line() << "work_sum = y;";
        src.close("}");
        src.new_line() << "else";
        src.open("{");
        src.new_line() << "post_sum[map_id + offset] = oper(pre_sum[map_id + offset], work_sum);";
        src.new_line() << "work_sum = post_sum[map_id + offset];";
        src.close("}");
        src.close("}");
        src.close("}");

        kernel = cache.insert(queue, backend::kernel(
                    queue, src.str(), "intra_block_inclusive_scan"));
    }

    return kernel->second;
}