//******************************************************************************
// Private Interface
//******************************************************************************
MEMORY_POOL::SEGMENT* MEMORY_POOL::GetNewSegment()
{
    SEGMENT* NewSegment = (SEGMENT*)malloc( AllocationSize );

    // Block storage starts at the first BlockSize-aligned address past the
    // segment header (assumes the malloc'ed pointer is BlockSize-aligned, so
    // that NumBlocks below does not overestimate the usable space).
    NewSegment->Memory          = (void*)alignup( (u8*)(NewSegment) + sizeof(SEGMENT), BlockSize );
    NewSegment->FreeBlock       = (BLOCK*)NewSegment->Memory;
    NewSegment->NextFreeSegment = null;
    NewSegment->Left            = null;
    NewSegment->Right           = null;
    NewSegment->Key             = (u64)(NewSegment->Memory);

    SegmentTree.Insert(NewSegment);

    // Thread every block in the segment onto a singly linked free list
    BLOCK* Block     = null;
    BLOCK* NextBlock = null;
    int NumBlocks = ( AllocationSize - alignup( sizeof(SEGMENT), BlockSize ) ) / BlockSize;
    for( int i = 0; i < NumBlocks - 1; i++ )
    {
        Block     = (BLOCK*)( (u8*)(NewSegment->Memory) + BlockSize*( i ) );
        NextBlock = (BLOCK*)( (u8*)(NewSegment->Memory) + BlockSize*( i + 1 ) );
        Block->Next = NextBlock;
    }
    Block = (BLOCK*)( (u8*)(NewSegment->Memory) + BlockSize*( NumBlocks - 1 ) );
    Block->Next = null;

    AddToFreeList(NewSegment);
    NumSegments++;

    return NewSegment;
}
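// The pool above (and most snippets in this section) relies on alignup()/
// aligndown() helpers that are not shown here. A minimal sketch, assuming
// rounding to an arbitrary positive multiple; the names and the pointer
// overload are assumptions, not the project's actual definitions.
#include <cstddef>
#include <cstdint>

static inline std::size_t alignup(std::size_t n, std::size_t a)
{
    return ((n + a - 1) / a) * a;           // round up to the next multiple of a
}

static inline std::size_t aligndown(std::size_t n, std::size_t a)
{
    return (n / a) * a;                     // round down to the previous multiple of a
}

static inline void* alignup(void* p, std::size_t a)
{
    const std::uintptr_t v = reinterpret_cast<std::uintptr_t>(p);
    return reinterpret_cast<void*>(((v + a - 1) / a) * a);
}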
inline kernel_call transpose_kernel(const cl::CommandQueue &queue, size_t width, size_t height, const cl::Buffer &in, const cl::Buffer &out) { std::ostringstream o; const auto dev = qdev(queue); kernel_common<T>(o, dev); // determine max block size to fit into local memory/workgroup size_t block_size = 128; { const auto local_size = dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); const auto workgroup = dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2; while(block_size * block_size > workgroup) block_size /= 2; } // from NVIDIA SDK. o << "__kernel void transpose(" << "__global const real2_t *input, __global real2_t *output, uint width, uint height) {\n" << " const size_t " << " global_x = get_global_id(0), global_y = get_global_id(1),\n" << " local_x = get_local_id(0), local_y = get_local_id(1),\n" << " group_x = get_group_id(0), group_y = get_group_id(1),\n" << " block_size = " << block_size << ",\n" << " target_x = local_y + group_y * block_size,\n" << " target_y = local_x + group_x * block_size;\n" << " const bool range = global_x < width && global_y < height;\n" // local memory << " __local real2_t block[" << (block_size * block_size) << "];\n" // copy from input to local memory << " if(range)\n" << " block[local_x + local_y * block_size] = input[global_x + global_y * width];\n" // wait until the whole block is filled << " barrier(CLK_LOCAL_MEM_FENCE);\n" // transpose local block to target << " if(range)\n" << " output[target_x + target_y * height] = block[local_x + local_y * block_size];\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "transpose"); kernel.setArg(0, in); kernel.setArg(1, out); kernel.setArg(2, static_cast<cl_uint>(width)); kernel.setArg(3, static_cast<cl_uint>(height)); // range multiple of wg size, last block maybe not completely filled. size_t r_w = alignup(width, block_size); size_t r_h = alignup(height, block_size); std::ostringstream desc; desc << "transpose{" << "w=" << width << "(" << r_w << "), " << "h=" << height << "(" << r_h << "), " << "bs=" << block_size << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(r_w, r_h), cl::NDRange(block_size, block_size)); }
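// For reference, a hedged host-side version of what the generated transpose
// kernel computes (the kernel treats the input as height rows of width
// complex elements and writes the transposed matrix); purely illustrative.
#include <complex>
#include <cstddef>
#include <vector>

template <typename T>
void transpose_reference(std::size_t width, std::size_t height,
                         const std::vector<std::complex<T>>& in,
                         std::vector<std::complex<T>>& out)
{
    for (std::size_t y = 0; y < height; ++y)
        for (std::size_t x = 0; x < width; ++x)
            // same mapping as output[target_x + target_y * height] above
            out[y + x * height] = in[x + y * width];
}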
// TODO: check
void* rpallocEx(rpheap_t _heap, size_t _cb, size_t _align)
{
	if (_heap == NULL)
		_heap = default_heap;
	if (_cb == 0)
		_cb = DEFAULT_ALIGN;

	RP_HEAP_HEADER* rhhHeap = _heap;

	// quick initial check (still not a guarantee that a suitable hole exists)
	if (rhhHeap->cbFree < _cb)
		return errno = ENOMEM, NULL;

	const void* const pHeapEnd = (char*)rhhHeap->pBase + rhhHeap->cbHeap;

	// local loop vars
	register RP_BLOCK_HEADER* rbhCurBlock;
	const size_t cbPotentialBlock = alignup(_cb, _align);

	// walk the block list looking for an aligned hole large enough
	register void* pCurBlock = rhhHeap->pBase;
	for (; pCurBlock != NULL && pCurBlock < pHeapEnd; pCurBlock = rbhCurBlock->pNext)
	{
		rbhCurBlock = pCurBlock;

		register char* pPotentialBlock = (char*)((uintptr_t)rbhCurBlock + rbhCurBlock->cbBlock);

		// check if we can align up
		pPotentialBlock = alignup(pPotentialBlock, _align);

		// upper bound of the hole: the next block, or the end of the heap
		register void* pVirtualMax = rbhCurBlock->pNext == NULL ? pHeapEnd : rbhCurBlock->pNext;

		// if there isn't room, move on to the next hole
		if (pPotentialBlock > pVirtualMax || pPotentialBlock + cbPotentialBlock > pVirtualMax)
			continue;

		pCurBlock = pPotentialBlock;
		break;
	}

	// the list was exhausted (last block's pNext is NULL) or we ran off the heap
	if (pCurBlock == NULL || pCurBlock >= pHeapEnd)
		return errno = ENOMEM, NULL;

	// put new header at location
	register RP_BLOCK_HEADER* rbhNewBlock = pCurBlock;
	rbhNewBlock->cbBlock = cbPotentialBlock;
	rbhNewBlock->pNext = rbhCurBlock->pNext;

	// update old references
	rbhCurBlock->pNext = rbhNewBlock;

	// note: rhhHeap->cbFree is left unchanged by this allocation
	return rbhNewBlock;   /* address of the newly linked block */
}
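// Hypothetical call site for rpallocEx (the size and alignment values are
// made up): request 100 bytes aligned to a 64-byte boundary from the default
// heap.
void* example_rpalloc(void)
{
	void* p = rpallocEx(NULL, 100, 64);
	if (p == NULL) {
		/* errno is ENOMEM when no suitably aligned hole exists */
	}
	return p;
}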
inline kernel_call bluestein_mul(const cl::CommandQueue &queue, size_t n, size_t batch, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); mul_code(o, false); o << "__kernel void bluestein_mul(" << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, uint stride) {\n" << " const size_t x = get_global_id(0), y = get_global_id(1);\n" << " if(x < stride) {\n" << " const size_t off = x + stride * y;" << " output[off] = mul(data[off], exp[x]);\n" << " }\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_mul"); kernel.setArg(0, data); kernel.setArg(1, exp); kernel.setArg(2, out); kernel.setArg(3, static_cast<cl_uint>(n)); const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue)); const size_t threads = alignup(n, wg); std::ostringstream desc; desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1)); }
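// The kernel above leans on a mul() helper emitted by mul_code(); assuming it
// performs a plain complex multiplication on real2_t = {re, im} pairs, a
// hedged host-side equivalent looks like this (types are illustrative only).
struct real2 { double x, y; };

static inline real2 complex_mul(real2 a, real2 b)
{
    real2 r;
    r.x = a.x * b.x - a.y * b.y;   // real part
    r.y = a.x * b.y + a.y * b.x;   // imaginary part
    return r;
}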
//******************************************************************************
MEMORY_POOL::MEMORY_POOL(u32 _BlockSize, u32 _AllocationSize)
{
    BlockSize      = max(sizeof(BLOCK), _BlockSize);
    AllocationSize = _AllocationSize;
    FreeSegments   = null;
    NumBlocks      = 0;
    NumSegments    = 0;

    // Even this is far too generous an assertion
    assert( BlockSize < AllocationSize / 2 );

    // Round the (already clamped) block size up to a multiple of the
    // free-list link size, so every block can hold a BLOCK header.
    BlockSize = alignup( BlockSize, sizeof(BLOCK) );
    DataSize  = AllocationSize - alignup( sizeof(SEGMENT), BlockSize );

    GetNewSegment();
}
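// Hypothetical construction of the pool above: 64-byte blocks carved out of
// 64 KiB segments (the argument values are illustrative, not from the source).
MEMORY_POOL ExamplePool(64, 64 * 1024);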
//*****************************************************************************
void Memcpy(void* dest, const void* src, size_t size)
{
    char*       Dest;
    const char* Src;
    char*       DestEnd;
    u64*        Dest2;
    const u64*  Src2;
    u64*        DestEnd2;

    // Copy single bytes until the destination is 8-byte aligned
    Dest    = (char*)dest;
    Src     = (const char*)src;
    DestEnd = (char*)alignup(dest, 8);
    if (DestEnd > (char*)dest + size)
        DestEnd = (char*)dest + size;   // don't run past the end on tiny copies
    while(Dest < DestEnd)
    {
        *Dest++ = *Src++;
    }

    // Do large copies (8 bytes at a time)
    Dest2    = (u64*)Dest;
    Src2     = (const u64*)Src;
    DestEnd2 = (u64*)aligndown((char*)dest + size, 8);
    while(Dest2 < DestEnd2)
    {
        *Dest2++ = *Src2++;
    }

    // Finish it up (anything on the tail end that's not 8-byte aligned)
    Dest    = (char*)Dest2;
    Src     = (const char*)Src2;
    DestEnd = (char*)dest + size;
    while(Dest < DestEnd)
    {
        *Dest++ = *Src++;
    }
}
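// A small hypothetical smoke test for Memcpy above: copy through a
// deliberately misaligned destination and compare against memcmp.
#include <cassert>
#include <cstring>

void memcpy_smoke_test()
{
    char src[64], dst[64];
    for (int i = 0; i < 64; ++i) src[i] = static_cast<char>(i);

    Memcpy(dst + 1, src + 1, 60);                    // unaligned start and length
    assert(std::memcmp(dst + 1, src + 1, 60) == 0);  // bytes must match exactly
}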
inline kernel_call radix_kernel(bool once, const cl::CommandQueue &queue, size_t n, size_t batch, bool invert, pow radix, size_t p, const cl::Buffer &in, const cl::Buffer &out) { std::ostringstream o; o << std::setprecision(25); const auto device = qdev(queue); kernel_common<T>(o, device); mul_code(o, invert); twiddle_code<T>(o); const size_t m = n / radix.value; kernel_radix<T>(o, radix, invert); auto program = build_sources(qctx(queue), o.str(), "-cl-mad-enable -cl-fast-relaxed-math"); cl::Kernel kernel(program, "radix"); kernel.setArg(0, in); kernel.setArg(1, out); kernel.setArg(2, static_cast<cl_uint>(p)); kernel.setArg(3, static_cast<cl_uint>(m)); const size_t wg_mul = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device); //const size_t max_cu = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); //const size_t max_wg = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); size_t wg = wg_mul; //while(wg * max_cu < max_wg) wg += wg_mul; //wg -= wg_mul; const size_t threads = alignup(m, wg); std::ostringstream desc; desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch << ", threads=" << m << "(" << threads << "), wg=" << wg << "}"; return kernel_call(once, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1)); }
int reduce_by_key_sink( IKTuple &&ikeys, vector<V> const &ivals, OKTuple &&okeys, vector<V> &ovals, Comp, Oper ) { namespace fusion = boost::fusion; typedef typename extract_value_types<IKTuple>::type K; static_assert( std::is_same<K, typename extract_value_types<OKTuple>::type>::value, "Incompatible input and output key types"); precondition( fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1, "reduce_by_key is only supported for single device contexts" ); precondition(fusion::at_c<0>(ikeys).size() == ivals.size(), "keys and values should have same size" ); const auto &queue = fusion::at_c<0>(ikeys).queue_list(); backend::select_context(queue[0]); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu; size_t count = fusion::at_c<0>(ikeys).size(); size_t num_blocks = (count + NT - 1) / NT; size_t scan_buf_size = alignup(num_blocks, NT); backend::device_vector<int> key_sum (queue[0], scan_buf_size); backend::device_vector<V> pre_sum (queue[0], scan_buf_size); backend::device_vector<V> post_sum (queue[0], scan_buf_size); backend::device_vector<V> offset_val(queue[0], count); backend::device_vector<int> offset (queue[0], count); /***** Kernel 0 *****/ auto krn0 = offset_calculation<K, Comp>(queue[0]); krn0.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn0)); krn0.push_arg(offset); krn0(queue[0]); VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
inline kernel_call bluestein_mul_in(const cl::CommandQueue &queue, bool inverse, size_t batch, size_t radix, size_t p, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); mul_code(o, false); twiddle_code<T>(o); o << "__kernel void bluestein_mul_in(" << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, " << "uint radix, uint p, uint out_stride) {\n" << " const size_t\n" << " thread = get_global_id(0), threads = get_global_size(0),\n" << " batch = get_global_id(1),\n" << " element = get_global_id(2);\n" << " if(element < out_stride) {\n" << " const size_t\n" << " in_off = thread + batch * radix * threads + element * threads,\n" << " out_off = thread * out_stride + batch * out_stride * threads + element;\n" << " if(element < radix) {\n" << " real2_t w = exp[element];" << " if(p != 1) {\n" << " const int sign = " << (inverse ? "+1" : "-1") << ";\n" << " ulong a = (ulong)element * (thread % p);\n" << " ulong b = (ulong)radix * p;\n" << " real2_t t = twiddle(2 * sign * M_PI * (a % (2 * b)) / b);\n" << " w = mul(w, t);\n" << " }\n" << " output[out_off] = mul(data[in_off], w);\n" << " } else\n" << " output[out_off] = (real2_t)(0,0);" << " }\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_mul_in"); kernel.setArg(0, data); kernel.setArg(1, exp); kernel.setArg(2, out); kernel.setArg(3, static_cast<cl_uint>(radix)); kernel.setArg(4, static_cast<cl_uint>(p)); kernel.setArg(5, static_cast<cl_uint>(stride)); const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue)); const size_t stride_pad = alignup(stride, wg); std::ostringstream desc; desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, stride_pad), cl::NDRange(1, 1, wg)); }
std::vector<size_t> partitioning_scheme<dummy>::get(size_t n, const std::vector<cl::CommandQueue> &queue) { if (!is_set) { weight = device_vector_perf; is_set = true; } std::vector<size_t> part; part.reserve(queue.size() + 1); part.push_back(0); if (queue.size() > 1) { std::vector<double> cumsum; cumsum.reserve(queue.size() + 1); cumsum.push_back(0); for(auto q = queue.begin(); q != queue.end(); q++) { cl::Context context = q->getInfo<CL_QUEUE_CONTEXT>(); cl::Device device = q->getInfo<CL_QUEUE_DEVICE>(); auto dw = device_weight.find(device()); double w = (dw == device_weight.end()) ? (device_weight[device()] = weight(context, device)) : dw->second; cumsum.push_back(cumsum.back() + w); } for(uint d = 1; d < queue.size(); d++) part.push_back( std::min(n, alignup(static_cast<size_t>(n * cumsum[d] / cumsum.back())) ) ); } part.push_back(n); return part; }
inline kernel_call bluestein_mul_out(const cl::CommandQueue &queue, size_t batch, size_t p, size_t radix, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); mul_code(o, false); o << "__kernel void bluestein_mul_out(" << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, " << "real_t div, uint p, uint in_stride, uint radix) {\n" << " const size_t\n" << " i = get_global_id(0), threads = get_global_size(0),\n" << " b = get_global_id(1),\n" << " l = get_global_id(2);\n" << " if(l < radix) {\n" << " const size_t\n" << " k = i % p,\n" << " j = k + (i - k) * radix,\n" << " in_off = i * in_stride + b * in_stride * threads + l,\n" << " out_off = j + b * threads * radix + l * p;\n" << " output[out_off] = mul(data[in_off] * div, exp[l]);\n" << " }\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_mul_out"); kernel.setArg(0, data); kernel.setArg(1, exp); kernel.setArg(2, out); kernel.setArg<T>(3, static_cast<T>(1) / stride); kernel.setArg(4, static_cast<cl_uint>(p)); kernel.setArg(5, static_cast<cl_uint>(stride)); kernel.setArg(6, static_cast<cl_uint>(radix)); const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue)); const size_t radix_pad = alignup(radix, wg); std::ostringstream desc; desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, radix_pad), cl::NDRange(1, 1, wg)); }
/*
 *    _ malloc() point
 *   /
 *  /    _ aligned point
 * /    /
 * +----+-----+---------------------------------------
 * |    |     |
 * +----+--|--+---------------------------------------
 *      ^  |
 *      |  | embedded address of malloc().
 *      +-------+
 *
 */
void *
ri_mem_alloc_aligned(size_t sz, uint32_t align)
{
	uint64_t  size;
	void     *p;
	void     *aligned;
	uint64_t  diff;

	assert(align > 0);
	assert(align % 16 == 0);

	size  = alignup((uint64_t)sz, align);
	size += align;    /* room to slide the start up to 'align' bytes */
	size += 8;        /* room for embed the address */

	p = malloc(size);
	if (p == NULL) return NULL;

	aligned = alignptr(p, align);
	if (aligned == p) {
		/* keep at least 8 bytes below the aligned point for the embedded address */
		aligned = (void *)((char *)aligned + align);    /* align up */
	}

	diff = (uint64_t)((uintptr_t)aligned - (uintptr_t)p);
	(void)diff;       /* only used by the debug prints below */

	/*
	printf("malloc point  : %p\n", p);
	printf("aligned point : %p\n", aligned);
	printf("diff          : %llu\n", diff);
	*/

	/*
	 * Embed malloc() address just below the aligned point.
	 */
	*((uint64_t *)aligned - 1) = (uintptr_t)p;

	assert(aligned != NULL);

	return aligned;
}
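/*
 * Hedged sketch of the matching release routine (the real project's function
 * name and error handling are assumptions): recover the malloc() address
 * embedded just below the aligned pointer and free it.
 */
#include <stdint.h>
#include <stdlib.h>

void
ri_mem_free_aligned(void *aligned)
{
	uint64_t orig;

	if (aligned == NULL) return;

	orig = *((uint64_t *)aligned - 1);   /* embedded malloc() address */
	free((void *)(uintptr_t)orig);
}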
std::vector<size_t> partitioning_scheme<dummy>::get(size_t n, const std::vector<backend::command_queue> &queue) { static const bool once = init_weight_function(); (void)once; // do not warn about unused variable std::vector<size_t> part; part.reserve(queue.size() + 1); part.push_back(0); if (queue.size() > 1) { std::vector<double> cumsum; cumsum.reserve(queue.size() + 1); cumsum.push_back(0); for(auto q = queue.begin(); q != queue.end(); q++) { auto dev_id = backend::get_device_id(*q); auto dw = device_weight.find(dev_id); double w = (dw == device_weight.end()) ? (device_weight[dev_id] = weight(*q)) : dw->second; cumsum.push_back(cumsum.back() + w); } for(unsigned d = 1; d < queue.size(); d++) part.push_back( std::min(n, alignup(static_cast<size_t>(n * cumsum[d] / cumsum.back())) ) ); } part.push_back(n); return part; }
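// To make the weighting rule concrete, a hedged sketch of the same split with
// the alignup() rounding left out: device d gets the range [part[d], part[d+1]).
// For example, n = 1000 with weights {1, 2} gives boundaries {0, 333, 1000}.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<std::size_t> partition_by_weight(std::size_t n, const std::vector<double>& w)
{
    std::vector<double> cumsum(1, 0.0);
    for (double x : w) cumsum.push_back(cumsum.back() + x);

    std::vector<std::size_t> part(1, 0);
    for (std::size_t d = 1; d < w.size(); ++d)
        part.push_back(std::min(n, static_cast<std::size_t>(n * cumsum[d] / cumsum.back())));
    part.push_back(n);
    return part;
}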
int main(int argc, char **argv) { int help=0; int arg=1; gasnet_handlerentry_t htable[] = { { hidx_ping_shorthandler, ping_shorthandler }, { hidx_pong_shorthandler, pong_shorthandler }, { hidx_ping_medhandler, ping_medhandler }, { hidx_pong_medhandler, pong_medhandler }, { hidx_ping_longhandler, ping_longhandler }, { hidx_pong_longhandler, pong_longhandler }, { hidx_ping_shorthandler_flood, ping_shorthandler_flood }, { hidx_pong_shorthandler_flood, pong_shorthandler_flood }, { hidx_ping_medhandler_flood, ping_medhandler_flood }, { hidx_pong_medhandler_flood, pong_medhandler_flood }, { hidx_ping_longhandler_flood, ping_longhandler_flood }, { hidx_pong_longhandler_flood, pong_longhandler_flood }, { hidx_done_shorthandler, done_shorthandler } }; GASNET_Safe(gasnet_init(&argc, &argv)); mynode = gasnet_mynode(); numnode = gasnet_nodes(); arg = 1; while (argc > arg) { if (!strcmp(argv[arg], "-p")) { #if GASNET_PAR pollers = test_thread_limit(atoi(argv[arg+1])+1)-1; arg += 2; #else if (0 == mynode) { fprintf(stderr, "testam %s\n", GASNET_CONFIG_STRING); fprintf(stderr, "ERROR: The -p option is only available in the PAR configuration.\n"); fflush(NULL); } sleep(1); gasnet_exit(1); #endif } else if (!strcmp(argv[arg], "-in")) { insegment = 1; ++arg; } else if (!strcmp(argv[arg], "-out")) { insegment = 0; ++arg; } else if (!strcmp(argv[arg], "-c")) { crossmachinemode = 1; ++arg; } else if (!strcmp(argv[arg], "-src-noop")) { src_mode = SRC_NOOP; ++arg; } else if (!strcmp(argv[arg], "-src-generate")) { src_mode = SRC_GENERATE; ++arg; } else if (!strcmp(argv[arg], "-src-memcpy")) { src_mode = SRC_MEMCPY; ++arg; } else if (argv[arg][0] == '-') { help = 1; ++arg; } else break; } if (argc > arg) { iters = atoi(argv[arg]); ++arg; } if (!iters) iters = 1000; if (argc > arg) { maxsz = atoi(argv[arg]); ++arg; } if (!maxsz) maxsz = 2*1024*1024; if (argc > arg) { TEST_SECTION_PARSE(argv[arg]); ++arg; } GASNET_Safe(gasnet_attach(htable, sizeof(htable)/sizeof(gasnet_handlerentry_t), TEST_SEGSZ_REQUEST, TEST_MINHEAPOFFSET)); #if GASNET_PAR #define PAR_USAGE \ " The -p option gives the number of polling threads, specified as\n" \ " a non-negative integer argument (default is no polling threads).\n" #else #define PAR_USAGE "" #endif test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n" " The '-in' or '-out' option selects whether the requestor's\n" " buffer is in the GASNet segment or not (default is 'in').\n" PAR_USAGE " The '-src-*' options select treatment of the payload buffer used for\n" " Medium and Long AMs, as follows:\n" " -src-noop: no per-operation initialization (default)\n" " -src-generate: initialized (w/o memory reads) on each AM injection\n" " -src-memcpy: initialized using memcpy() on each AM injection\n" " The -c option enables cross-machine pairing (default is nearest neighbor).\n"); if (help || argc > arg) test_usage(); TEST_PRINT_CONDUITINFO(); if (insegment) { myseg = TEST_MYSEG(); } else { char *space = test_malloc(alignup(maxsz,PAGESZ) + PAGESZ); myseg = alignup_ptr(space, PAGESZ); } maxmed = MIN(maxsz, gasnet_AMMaxMedium()); maxlongreq = MIN(maxsz, gasnet_AMMaxLongRequest()); maxlongrep = MIN(maxsz, gasnet_AMMaxLongReply()); if (src_mode == SRC_MEMCPY) { zero_buffer = test_calloc(maxsz, 1); } if (crossmachinemode) { if ((numnode%2) && (mynode == numnode-1)) { sender = 1; peer = mynode; } else { gasnet_node_t half = numnode / 2; sender = (mynode < half); peer = sender ? 
(mynode + half) : (mynode - half); } } else { peer = mynode ^ 1; sender = mynode % 2 == 0; if (peer == numnode) { peer = mynode; } } recvr = !sender || (peer == mynode); // Long Request and Reply (distinct for loopback) reply_addr = TEST_SEG(peer); request_addr = (peer == mynode) ? (void*)((uintptr_t)reply_addr + alignup(maxsz,SIZEOF_GASNET_REGISTER_VALUE_T)) : reply_addr; BARRIER(); #if GASNET_PAR #define PAR_FMT " %i extra recvr polling threads\n" #define PAR_ARG ,pollers #else #define PAR_FMT /*empty*/ #define PAR_ARG /*empty*/ #endif if (mynode == 0) { printf("Running %i iterations of %s AM performance with:\n" " local addresses %sside the segment%s\n" " %s\n" PAR_FMT " ...\n", iters, (crossmachinemode ? "cross-machine ": ""), (insegment ? "in" : "out"), (insegment ? " (default)" : ""), ((src_mode == SRC_NOOP) ? "no payload initialization (default)" :(src_mode == SRC_GENERATE) ? "payload initialized by computation" : "payload initialized using memcpy()") PAR_ARG ); printf(" Msg Sz Description Total time Avg. time Bandwidth\n" " ------ ----------- ---------- --------- ---------\n"); fflush(stdout); } #if GASNET_PAR TEST_SET_WAITMODE(pollers+1); if (pollers) test_createandjoin_pthreads(pollers+1,doAll,NULL,0); else #endif doAll(NULL); MSG("done."); gasnet_exit(0); return 0; }
int reduce_by_key_sink( IKTuple &&ikeys, vector<V> const &ivals, OKTuple &&okeys, vector<V> &ovals, Comp, Oper ) { namespace fusion = boost::fusion; typedef typename extract_value_types<IKTuple>::type K; static_assert( std::is_same<K, typename extract_value_types<OKTuple>::type>::value, "Incompatible input and output key types"); precondition( fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1, "Sorting is only supported for single device contexts" ); precondition(fusion::at_c<0>(ikeys).size() == ivals.size(), "keys and values should have same size" ); const auto &queue = fusion::at_c<0>(ikeys).queue_list(); backend::select_context(queue[0]); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu; size_t count = fusion::at_c<0>(ikeys).size(); size_t num_blocks = (count + NT - 1) / NT; size_t scan_buf_size = alignup(num_blocks, NT); backend::device_vector<int> key_sum (queue[0], scan_buf_size); backend::device_vector<V> pre_sum (queue[0], scan_buf_size); backend::device_vector<V> post_sum (queue[0], scan_buf_size); backend::device_vector<V> offset_val(queue[0], count); backend::device_vector<int> offset (queue[0], count); /***** Kernel 0 *****/ auto krn0 = detail::offset_calculation<K, Comp>(queue[0]); krn0.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn0)); krn0.push_arg(offset); krn0(queue[0]); VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;"); detail::scan(queue[0], offset, offset, 0, false, plus); /***** Kernel 1 *****/ auto krn1 = is_cpu(queue[0]) ? detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) : detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]); krn1.push_arg(count); krn1.push_arg(offset); krn1.push_arg(ivals(0)); krn1.push_arg(offset_val); krn1.push_arg(key_sum); krn1.push_arg(pre_sum); krn1.config(num_blocks, NT); krn1(queue[0]); /***** Kernel 2 *****/ uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT)); auto krn2 = is_cpu(queue[0]) ? detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) : detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]); krn2.push_arg(num_blocks); krn2.push_arg(key_sum); krn2.push_arg(pre_sum); krn2.push_arg(post_sum); krn2.push_arg(work_per_thread); krn2.config(1, NT); krn2(queue[0]); /***** Kernel 3 *****/ auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]); krn3.push_arg(count); krn3.push_arg(key_sum); krn3.push_arg(post_sum); krn3.push_arg(offset); krn3.push_arg(offset_val); krn3.config(num_blocks, NT); krn3(queue[0]); /***** resize okeys and ovals *****/ int out_elements; offset.read(queue[0], count - 1, 1, &out_elements, true); ++out_elements; boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements)); ovals.resize(ivals.queue_list(), out_elements); /***** Kernel 4 *****/ auto krn4 = detail::key_value_mapping<K, V>(queue[0]); krn4.push_arg(count); boost::fusion::for_each(ikeys, do_push_arg(krn4)); boost::fusion::for_each(okeys, do_push_arg(krn4)); krn4.push_arg(ovals(0)); krn4.push_arg(offset); krn4.push_arg(offset_val); krn4(queue[0]); return out_elements; }
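// For reference, a hedged host-side version of the semantics the kernels above
// implement, with the Comp and Oper functors specialized to operator== and
// operator+: consecutive runs of equal keys collapse to one output element.
#include <cstddef>
#include <vector>

template <typename K, typename V>
std::size_t reduce_by_key_reference(const std::vector<K>& keys, const std::vector<V>& vals,
                                    std::vector<K>& okeys, std::vector<V>& ovals)
{
    okeys.clear(); ovals.clear();
    for (std::size_t i = 0; i < keys.size(); ++i) {
        if (i == 0 || !(keys[i] == keys[i - 1])) {       // start of a new key run
            okeys.push_back(keys[i]);
            ovals.push_back(vals[i]);
        } else {
            ovals.back() = ovals.back() + vals[i];       // reduce within the run
        }
    }
    return okeys.size();                                  // number of output elements
}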
ell( const std::vector<backend::command_queue> &q, size_t nrows, size_t ncols, const PtrRange &ptr, const ColRange &col, const ValRange &val, bool fast = true ) : q(q[0]), n(nrows), m(ncols), nnz(boost::size(val)), ell_pitch(alignup(nrows, 16U)), csr_nnz(0) { precondition(q.size() == 1, "sparse::ell is only supported for single-device contexts"); if (fast) { convert(ptr, col, val); return; } /* 1. Get optimal ELL widths for local and remote parts. */ // Speed of ELL relative to CSR: const double ell_vs_csr = 3.0; // Find maximum widths for local and remote parts: int max_width = 0; for(size_t i = 0; i < n; ++i) max_width = std::max(max_width, static_cast<int>(ptr[i+1] - ptr[i])); // Build width distribution histogram. std::vector<size_t> hist(max_width + 1, 0); for(size_t i = 0; i < n; ++i) ++hist[ptr[i+1] - ptr[i]]; // Estimate optimal width for ELL part of the matrix. ell_width = max_width; { size_t rows = n; for(int i = 0; i < max_width; ++i) { rows -= hist[i]; // Number of rows wider than i. if (ell_vs_csr * rows < n) { ell_width = i; break; } } } if (ell_width == 0) { csr_nnz = nnz; csr_ptr = backend::device_vector<Col>(q[0], n + 1, &ptr[0]); csr_col = backend::device_vector<Col>(q[0], csr_nnz, &col[0]); csr_val = backend::device_vector<Val>(q[0], csr_nnz, &val[0]); return; } // Count nonzeros in CSR part of the matrix. for(int i = ell_width + 1; i <= max_width; ++i) csr_nnz += hist[i] * (i - ell_width); /* 3. Split the input matrix into ELL and CSR submatrices. */ std::vector<Col> _ell_col(ell_pitch * ell_width, static_cast<Col>(-1)); std::vector<Val> _ell_val(ell_pitch * ell_width); std::vector<Ptr> _csr_ptr; std::vector<Col> _csr_col; std::vector<Val> _csr_val; if (csr_nnz) { _csr_ptr.resize(n + 1); _csr_col.resize(csr_nnz); _csr_val.resize(csr_nnz); _csr_ptr[0] = 0; for(size_t i = 0; i < n; ++i) { Ptr w = ptr[i+1] - ptr[i]; _csr_ptr[i+1] = _csr_ptr[i] + static_cast<Ptr>(w > ell_width ? w - ell_width : 0); } } for(size_t i = 0; i < n; ++i) { int w = 0; Ptr csr_head = csr_nnz ? _csr_ptr[i] : 0; for(Ptr j = ptr[i], e = ptr[i+1]; j < e; ++j, ++w) { Col c = col[j]; Val v = val[j]; if (w < ell_width) { _ell_col[i + w * ell_pitch] = c; _ell_val[i + w * ell_pitch] = v; } else { _csr_col[csr_head] = c; _csr_val[csr_head] = v; ++csr_head; } } } ell_col = backend::device_vector<Col>(q[0], ell_pitch * ell_width, _ell_col.data()); ell_val = backend::device_vector<Val>(q[0], ell_pitch * ell_width, _ell_val.data()); if (csr_nnz) { csr_ptr = backend::device_vector<Col>(q[0], n + 1, _csr_ptr.data()); csr_col = backend::device_vector<Col>(q[0], csr_nnz, _csr_col.data()); csr_val = backend::device_vector<Val>(q[0], csr_nnz, _csr_val.data()); } }
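// Hedged illustration of the ELL layout built above: slot w of row i lives at
// index i + w * ell_pitch (column-major by slot), and unused slots carry a
// column index of -1. The helper below just walks one row of the ELL part;
// it is illustrative, not part of the class.
#include <cstddef>
#include <vector>

template <typename Col, typename Val>
Val ell_row_sum(std::size_t i, int width, std::size_t pitch,
                const std::vector<Col>& ell_col, const std::vector<Val>& ell_val)
{
    Val sum = Val();
    for (int w = 0; w < width; ++w) {
        if (ell_col[i + w * pitch] == static_cast<Col>(-1))
            break;                           // padding: this row has fewer entries
        sum += ell_val[i + w * pitch];       // a real SpMV would multiply by x[col] here
    }
    return sum;
}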
int main(int argc, char **argv) { /* call startup */ GASNET_Safe(gasnet_init(&argc, &argv)); /* parse arguments */ arg = 1; while (argc > arg) { if (!strcmp(argv[arg], "-in")) { insegment = 1; ++arg; } else if (!strcmp(argv[arg], "-out")) { insegment = 0; ++arg; } else if (!strcmp(argv[arg], "-f")) { firstlastmode = 1; ++arg; } else if (!strcmp(argv[arg], "-a")) { fullduplexmode = 1; ++arg; } else if (!strcmp(argv[arg], "-p")) { do_puts = 1; numflavors++; ++arg; } else if (!strcmp(argv[arg], "-g")) { do_gets = 1; numflavors++; ++arg; } else if (!strcmp(argv[arg], "-s")) { do_amshort = 1; numflavors++; ++arg; } else if (!strcmp(argv[arg], "-m")) { do_ammedium = 1; numflavors++; ++arg; } else if (!strcmp(argv[arg], "-l")) { do_amlong = 1; numflavors++; ++arg; } else if (!strcmp(argv[arg], "-b")) { do_bulk = 1; ++arg; } else if (!strcmp(argv[arg], "-n")) { do_nonbulk = 1; ++arg; } else if (!strcmp(argv[arg], "-v")) { do_value = 1; ++arg; } else if (!strcmp(argv[arg], "-i")) { do_implicit = 1; numsync++; ++arg; } else if (!strcmp(argv[arg], "-e")) { do_explicit = 1; numsync++; ++arg; } else if (!strcmp(argv[arg], "-k")) { do_blocking = 1; numsync++; ++arg; } else if (argv[arg][0] == '-') { help = 1; ++arg; } else break; } if (fullduplexmode && firstlastmode) help = 1; if (argc > arg+3) help = 1; if (argc > arg) { iters = atoi(argv[arg]); arg++; } if (!iters) iters = 10; if (argc > arg) { maxdepth = atoi(argv[arg]); arg++; } if (!maxdepth) maxdepth = 1024; /* 1024 default */ if (argc > arg) { maxsz = atoi(argv[arg]); arg++; } if (!maxsz) maxsz = 2*1024*1024; /* 2 MB default */ #ifdef GASNET_SEGMENT_EVERYTHING if (maxsz > TEST_SEGSZ) { MSG("maxsz must be <= %lu on GASNET_SEGMENT_EVERYTHING",(unsigned long)TEST_SEGSZ); gasnet_exit(1); } #endif GASNET_Safe(gasnet_attach(htable, sizeof(htable)/sizeof(gasnet_handlerentry_t), TEST_SEGSZ_REQUEST, TEST_MINHEAPOFFSET)); test_init("testqueue",1,"[-in|-out|-a|-f] (iters) (maxdepth) (maxsz)\n" " The 'in' or 'out' option selects whether the initiator-side\n" " memory is in the GASNet segment or not (default is not).\n" " The -a option enables full-duplex mode, where all nodes send.\n" " The -f option enables 'first/last' mode, where the first node\n" " sends to the last, while all other nodes sit idle.\n" " Test types to run: (defaults to everything)\n" " -p : puts\n" " -g : gets\n" " -s : AMShort\n" " -m : AMMedium\n" " -l : AMLong\n" " -n : Test non-bulk put/gets\n" " -b : Test bulk put/gets\n" " -v : Test value-based put/gets\n" " -i : Test implicit-handle put/gets\n" " -e : Test explicit-handle put/gets\n" " -k : Test blocking put/gets\n"); if (help) test_usage(); min_payload = 1; max_payload = maxsz; if (numflavors == 0) { /* default to all */ do_puts = 1; do_gets = 1; do_amshort = 1; do_ammedium = 1; do_amlong = 1; } if (numsync == 0) { /* default to all */ do_implicit = 1; do_explicit = 1; do_blocking = 1; } if (!do_bulk && !do_nonbulk && !do_value) { do_bulk = 1; do_nonbulk = 1; do_value = 1; } if (!do_implicit && !do_explicit && !do_blocking) { do_implicit = 1; do_explicit = 1; do_blocking = 1; } if (max_payload < min_payload) { printf("ERROR: maxsz must be >= %i\n",min_payload); gasnet_exit(1); } /* get SPMD info */ myproc = gasnet_mynode(); numprocs = gasnet_nodes(); if (!firstlastmode) { /* Only allow 1 or even number for numprocs */ if (numprocs > 1 && numprocs % 2 != 0) { MSG0("WARNING: This test requires a unary or even number of nodes. 
Test skipped.\n"); gasnet_exit(0); /* exit 0 to prevent false negatives in test harnesses for smp-conduit */ } } /* Setting peer thread rank */ if (firstlastmode) { peerproc = numprocs-1; iamsender = (myproc == 0); iamrecver = (myproc == numprocs-1); multisender = 0; } else if (numprocs == 1) { peerproc = 0; iamsender = 1; iamrecver = 1; multisender = 0; } else { peerproc = (myproc % 2) ? (myproc - 1) : (myproc + 1); iamsender = (fullduplexmode || myproc % 2 == 0); iamrecver = (fullduplexmode || !iamsender); multisender = (fullduplexmode || numprocs >= 4); } multisender = 1; /* messes up output on some systems */ myseg = TEST_SEG(myproc); tgtmem = TEST_SEG(peerproc); if (insegment) { msgbuf = (void *) myseg; } else { alloc = (void *) test_calloc(maxsz+PAGESZ,1); /* use calloc to prevent valgrind warnings */ msgbuf = (void *) alignup(((uintptr_t)alloc), PAGESZ); /* ensure page alignment of base */ } assert(((uintptr_t)msgbuf) % PAGESZ == 0); MSG0("Running %squeue test with local addr %sside segment, iters=%i, maxdepth=%i, sz: %i...%i", firstlastmode ? "first/last " : (fullduplexmode ? "full-duplex ": ""), insegment ? "in" : "out", iters, maxdepth, min_payload, max_payload); MSG0("x-axis: queue depth, y-axis: message size, injection time in microseconds\n"); BARRIER(); handles = (gasnet_handle_t *) test_malloc(sizeof(gasnet_handle_t) * maxdepth); vghandles = (gasnet_valget_handle_t *) test_malloc(sizeof(gasnet_valget_handle_t) * maxdepth); do_bulkputgets(); do_nonbulkputgets(); do_valueputgets(); do_blockingputgets(); do_amtests(); BARRIER(); test_free(handles); if (!insegment) { test_free(alloc); } gasnet_exit(0); return 0; }
int main(int argc, char **argv) { int help=0; int arg=1; GASNET_Safe(gex_Client_Init(&myclient, &myep, &myteam, "testam", &argc, &argv, 0)); mynode = gex_TM_QueryRank(myteam); numnode = gex_TM_QuerySize(myteam); arg = 1; while (argc > arg) { if (!strcmp(argv[arg], "-p")) { #if GASNET_PAR pollers = test_thread_limit(atoi(argv[arg+1])+1)-1; arg += 2; #else if (0 == mynode) { fprintf(stderr, "testam %s\n", GASNET_CONFIG_STRING); fprintf(stderr, "ERROR: The -p option is only available in the PAR configuration.\n"); fflush(NULL); } sleep(1); gasnet_exit(1); #endif } else if (!strcmp(argv[arg], "-in")) { insegment = 1; ++arg; } else if (!strcmp(argv[arg], "-out")) { insegment = 0; ++arg; } else if (!strcmp(argv[arg], "-c")) { crossmachinemode = 1; ++arg; } else if (!strcmp(argv[arg], "-sync-req")) { asynclc = 0; lc_opt = GEX_EVENT_NOW; ++arg; } else if (!strcmp(argv[arg], "-async-req")) { asynclc = 1; lc_opt = GEX_EVENT_GROUP; ++arg; } else if (!strcmp(argv[arg], "-fp")) { use_np = 0; ++arg; } else if (!strcmp(argv[arg], "-np-cb")) { use_np = 1; np_cbuf = 1; ++arg; } else if (!strcmp(argv[arg], "-np-gb")) { use_np = 1; np_cbuf = 0; ++arg; } else if (!strcmp(argv[arg], "-src-noop")) { src_mode = SRC_NOOP; ++arg; } else if (!strcmp(argv[arg], "-src-generate")) { src_mode = SRC_GENERATE; ++arg; } else if (!strcmp(argv[arg], "-src-memcpy")) { src_mode = SRC_MEMCPY; ++arg; } else if (argv[arg][0] == '-') { help = 1; ++arg; } else break; } if (argc > arg) { iters = atoi(argv[arg]); ++arg; } if (!iters) iters = 1000; if (argc > arg) { maxsz = atoi(argv[arg]); ++arg; } if (!maxsz) maxsz = 2*1024*1024; if (argc > arg) { TEST_SECTION_PARSE(argv[arg]); ++arg; } GASNET_Safe(gex_Segment_Attach(&mysegment, myteam, TEST_SEGSZ_REQUEST)); GASNET_Safe(gex_EP_RegisterHandlers(myep, htable, sizeof(htable)/sizeof(gex_AM_Entry_t))); #if GASNET_PAR #define PAR_USAGE \ " The -p option gives the number of polling threads, specified as\n" \ " a non-negative integer argument (default is no polling threads).\n" #else #define PAR_USAGE "" #endif test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n" " The '-in' or '-out' option selects whether the requestor's\n" " buffer is in the GASNet segment or not (default is 'in').\n" PAR_USAGE " The '-sync-req' or '-async-req' option selects synchronous or asynchronous\n" " local completion of Medium and Long Requests (default is synchronous).\n" " The '-fp', '-np-gb' or '-np-cb' option selects Fixed- or Negotiated-Payload\n" " for Medium and Long AMs, as follows:\n" " -fp: Fixed-Payload (default)\n" " -np-gb: Negotiated-Payload with GASNet-provided buffer\n" " -np-cb: Negotiated-Payload with client-provided buffer\n" " The '-src-*' options select treatment of the payload buffer used for\n" " Medium and Long AMs, as follows:\n" " -src-noop: no per-operation initialization (default)\n" " -src-generate: initialized (w/o memory reads) on each AM injection\n" " -src-memcpy: initialized using memcpy() on each AM injection\n" " The -c option enables cross-machine pairing (default is nearest neighbor).\n"); if (help || argc > arg) test_usage(); TEST_PRINT_CONDUITINFO(); if (insegment) { myseg = TEST_MYSEG(); } else { char *space = test_malloc(alignup(maxsz,PAGESZ) + PAGESZ); myseg = alignup_ptr(space, PAGESZ); } if (src_mode == SRC_MEMCPY) { zero_buffer = test_calloc(maxsz, 1); } np_lc_opt = np_cbuf ? 
lc_opt : NULL; if (crossmachinemode) { if ((numnode%2) && (mynode == numnode-1)) { sender = 1; peer = mynode; } else { gex_Rank_t half = numnode / 2; sender = (mynode < half); peer = sender ? (mynode + half) : (mynode - half); } } else { peer = mynode ^ 1; sender = mynode % 2 == 0; if (peer == numnode) { peer = mynode; } } gex_Event_t *tmp_lc_opt = use_np ? np_lc_opt : lc_opt; gex_Flags_t flags = use_np ? ( np_cbuf ? GEX_FLAG_AM_PREPARE_LEAST_CLIENT : GEX_FLAG_AM_PREPARE_LEAST_ALLOC) : 0; maxmedreq = MIN(maxsz, gex_AM_MaxRequestMedium(myteam,peer,tmp_lc_opt,flags,0)); maxmedrep = MIN(maxsz, gex_AM_MaxReplyMedium (myteam,peer,GEX_EVENT_NOW,flags,0)); maxlongreq = MIN(maxsz, gex_AM_MaxRequestLong (myteam,peer,tmp_lc_opt,flags,0)); maxlongrep = MIN(maxsz, gex_AM_MaxReplyLong (myteam,peer,GEX_EVENT_NOW,flags,0)); recvr = !sender || (peer == mynode); // Long Request and Reply (distinct for loopback) reply_addr = TEST_SEG(peer); request_addr = (peer == mynode) ? (void*)((uintptr_t)reply_addr + alignup(maxsz,SIZEOF_GEX_RMA_VALUE_T)) : reply_addr; BARRIER(); #if GASNET_PAR #define PAR_FMT " %i extra recvr polling threads\n" #define PAR_ARG ,pollers #else #define PAR_FMT /*empty*/ #define PAR_ARG /*empty*/ #endif if (mynode == 0) { printf("Running %i iterations of %s AM performance with:\n" " local addresses %sside the segment%s\n" " %ssynchronous LC for Requests%s\n" " %s\n" " %s\n" PAR_FMT " ...\n", iters, (crossmachinemode ? "cross-machine ": ""), (insegment ? "in" : "out"), (insegment ? " (default)" : ""), (asynclc ? "a": ""), (asynclc ? "": " (default)"), (!use_np ? "fixed-Payload (default)" :(np_cbuf ? "negotiated-Payload with client-provided buffer" : "negotiated-Payload with GASNet-provided buffer")), ((src_mode == SRC_NOOP) ? "no payload initialization (default)" :(src_mode == SRC_GENERATE) ? "payload initialized by computation" : "payload initialized using memcpy()") PAR_ARG ); printf(" Msg Sz Description Total time Avg. time Bandwidth\n" " ------ ----------- ---------- --------- ---------\n"); fflush(stdout); } #if GASNET_PAR TEST_SET_WAITMODE(pollers+1); if (pollers) test_createandjoin_pthreads(pollers+1,doAll,NULL,0); else #endif doAll(NULL); MSG("done."); gasnet_exit(0); return 0; }
int main(int argc, char **argv) { int help=0; int arg=1; gasnet_handlerentry_t htable[] = { { hidx_ping_shorthandler, ping_shorthandler }, { hidx_pong_shorthandler, pong_shorthandler }, { hidx_ping_medhandler, ping_medhandler }, { hidx_pong_medhandler, pong_medhandler }, { hidx_ping_longhandler, ping_longhandler }, { hidx_pong_longhandler, pong_longhandler }, { hidx_ping_shorthandler_flood, ping_shorthandler_flood }, { hidx_pong_shorthandler_flood, pong_shorthandler_flood }, { hidx_ping_medhandler_flood, ping_medhandler_flood }, { hidx_pong_medhandler_flood, pong_medhandler_flood }, { hidx_ping_longhandler_flood, ping_longhandler_flood }, { hidx_pong_longhandler_flood, pong_longhandler_flood }, { hidx_done_shorthandler, done_shorthandler } }; GASNET_Safe(gasnet_init(&argc, &argv)); mynode = gasnet_mynode(); numnode = gasnet_nodes(); arg = 1; while (argc > arg) { if (!strcmp(argv[arg], "-p")) { #if GASNET_PAR pollers = atoi(argv[arg+1]); arg += 2; #else if (0 == mynode) { fprintf(stderr, "testam %s\n", GASNET_CONFIG_STRING); fprintf(stderr, "ERROR: The -p option is only available in the PAR configuration.\n"); fflush(NULL); } sleep(1); gasnet_exit(1); #endif } else if (!strcmp(argv[arg], "-in")) { insegment = 1; ++arg; } else if (!strcmp(argv[arg], "-out")) { insegment = 0; ++arg; } else if (!strcmp(argv[arg], "-c")) { crossmachinemode = 1; ++arg; } else if (argv[arg][0] == '-') { help = 1; ++arg; } else break; } if (argc > arg) { iters = atoi(argv[arg]); ++arg; } if (!iters) iters = 1000; if (argc > arg) { maxsz = atoi(argv[arg]); ++arg; } if (!maxsz) maxsz = 2*1024*1024; if (argc > arg) { TEST_SECTION_PARSE(argv[arg]); ++arg; } GASNET_Safe(gasnet_attach(htable, sizeof(htable)/sizeof(gasnet_handlerentry_t), TEST_SEGSZ_REQUEST, TEST_MINHEAPOFFSET)); #if GASNET_PAR test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n" " The '-in' or '-out' option selects whether the requestor's\n" " buffer is in the GASNet segment or not (default is 'in').\n" " The -p option gives the number of polling threads, specified as\n" " a non-negative integer argument (default is no polling threads).\n" " The -c option enables cross-machine pairing (default is nearest neighbor).\n"); #else test_init("testam", 1, "[options] (iters) (maxsz) (test_sections)\n" " The '-in' or '-out' option selects whether the requestor's\n" " buffer is in the GASNet segment or not (default is 'in').\n" " The -c option enables cross-machine pairing (default is nearest neighbor).\n"); #endif if (help || argc > arg) test_usage(); TEST_PRINT_CONDUITINFO(); if (insegment) { myseg = TEST_MYSEG(); } else { char *space = test_malloc(alignup(maxsz,PAGESZ) + PAGESZ); myseg = alignup_ptr(space, PAGESZ); } maxmed = MIN(maxsz, gasnet_AMMaxMedium()); maxlongreq = MIN(maxsz, gasnet_AMMaxLongRequest()); maxlongrep = MIN(maxsz, gasnet_AMMaxLongReply()); if (crossmachinemode) { if ((numnode%2) && (mynode == numnode-1)) { sender = 1; peer = mynode; } else { gasnet_node_t half = numnode / 2; sender = (mynode < half); peer = sender ? (mynode + half) : (mynode - half); } } else { peer = mynode ^ 1; sender = mynode % 2 == 0; if (peer == numnode) { peer = mynode; } } recvr = !sender || (peer == mynode); peerseg = TEST_SEG(peer); BARRIER(); if (mynode == 0) { printf("Running %sAM performance test with %i iterations" #if GASNET_PAR " and %i extra recvr polling threads" #endif "...\n", (crossmachinemode ? "cross-machine ": ""), iters #if GASNET_PAR ,pollers #endif ); printf(" Msg Sz Description Total time Avg. 
time Bandwidth\n" " ------ ----------- ---------- --------- ---------\n"); fflush(stdout); } #if GASNET_PAR if (pollers) test_createandjoin_pthreads(pollers+1,doAll,NULL,0); else #endif doAll(NULL); MSG("done."); gasnet_exit(0); return 0; }
void scan( backend::command_queue const &queue, backend::device_vector<T> const &input, backend::device_vector<T> &output, T init, bool exclusive, Oper ) { precondition( input.size() == output.size(), "Wrong output size in inclusive_scan" ); backend::select_context(queue); const int NT_cpu = 1; const int NT_gpu = 256; const int NT = is_cpu(queue) ? NT_cpu : NT_gpu; const int NT2 = 2 * NT; int do_exclusive = exclusive ? 1 : 0; const size_t count = input.size(); const size_t num_blocks = (count + NT2 - 1) / NT2; const size_t scan_buf_size = alignup(num_blocks, NT2); backend::device_vector<T> pre_sum1(queue, scan_buf_size); backend::device_vector<T> pre_sum2(queue, scan_buf_size); backend::device_vector<T> post_sum(queue, scan_buf_size); // Kernel0 auto krn0 = is_cpu(queue) ? block_inclusive_scan<NT_cpu, T, Oper>(queue) : block_inclusive_scan<NT_gpu, T, Oper>(queue); krn0.push_arg(count); krn0.push_arg(input); krn0.push_arg(init); krn0.push_arg(pre_sum1); krn0.push_arg(pre_sum2); krn0.push_arg(do_exclusive); krn0.config(num_blocks, NT); krn0(queue); // Kernel1 auto krn1 = is_cpu(queue) ? intra_block_inclusive_scan<NT_cpu, T, Oper>(queue) : intra_block_inclusive_scan<NT_gpu, T, Oper>(queue); uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT)); krn1.push_arg(num_blocks); krn1.push_arg(post_sum); krn1.push_arg(pre_sum1); krn1.push_arg(init); krn1.push_arg(work_per_thread); krn1.config(1, NT); krn1(queue); // Kernel2 auto krn2 = is_cpu(queue) ? block_addition<NT_cpu, T, Oper>(queue) : block_addition<NT_gpu, T, Oper>(queue); krn2.push_arg(count); krn2.push_arg(input); krn2.push_arg(output); krn2.push_arg(post_sum); krn2.push_arg(pre_sum2); krn2.push_arg(init); krn2.push_arg(do_exclusive); krn2.config(num_blocks * 2, NT); krn2(queue); }
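// For reference, a hedged sequential version of the scan the three kernels
// above implement; whether 'init' also seeds the inclusive variant is an
// assumption here.
#include <cstddef>
#include <vector>

template <typename T, typename Oper>
void scan_reference(const std::vector<T>& in, std::vector<T>& out,
                    T init, bool exclusive, Oper op)
{
    T sum = init;
    for (std::size_t i = 0; i < in.size(); ++i) {
        if (exclusive) {
            out[i] = sum;              // value before including in[i]
            sum = op(sum, in[i]);
        } else {
            sum = op(sum, in[i]);
            out[i] = sum;              // value after including in[i]
        }
    }
}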