void susan_responses(T* out, const T* in,
                     const unsigned idim0, const unsigned idim1,
                     const int radius, const float t, const float g,
                     const unsigned edge)
{
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0 - edge * 2, BLOCK_X),
                divup(idim1 - edge * 2, BLOCK_Y));

    const size_t SMEM_SIZE =
        (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(T);

    CUDA_LAUNCH_SMEM((susanKernel<T>), blocks, threads, SMEM_SIZE,
                     out, in, idim0, idim1, radius, t, g, edge);
    POST_LAUNCH_CHECK();
}
void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
                const dim_t nx)
{
    dim3 threads(THREADS_X, THREADS_Y);
    dim3 blocks(divup(out.dims[0], threads.x), out.dims[2] * out.dims[3]);

    dim_t reps = divup((wx * wy), threads.y);

    CUDA_LAUNCH((unwrap_kernel<T, false>), blocks, threads,
                out, in, wx, wy, sx, sy, px, py, nx, reps);
    POST_LAUNCH_CHECK();
}
void EntryData::newSheet() {
    int newMax = 0;
    // I think we can actually assume the last block is on the last page,
    // but I am going to be bloody minded about it.
    foreach (BlockData *b, blocks()) {
        if (b->lastSheet() > newMax)
            newMax = b->lastSheet();
    }
    bool doEmit = maxSheet >= 0;
    if (newMax != maxSheet) {
        maxSheet = newMax;
        if (doEmit)
            emit sheetCountMod();
    }
}
void EntryData::loadMore(QVariantMap const &src) {
    Data::loadMore(src);

    TitleData *title_ = firstChild<TitleData>();
    // Any old title has already been destructed by Data's loadChildren()
    ASSERT(title_);
    connect(title_, SIGNAL(textMod()), SIGNAL(titleMod()));

    maxSheet = 0;
    foreach (BlockData *b, blocks()) {
        if (b->lastSheet() > maxSheet)
            maxSheet = b->lastSheet();
        connect(b, SIGNAL(newSheet(int)), SLOT(newSheet()));
        connect(b, SIGNAL(sheetCountMod(int)), SLOT(newSheet()));
    }
}
void Bk::setDataInMongoLocal() {
    if (m_Order > 0) {
        delete *m_pBlockForReadFileIter;
        m_pBlockForReadFileIter++;
    }

    Local::Files file(m_pMongoLocal);
    file.InsertInfo(m_FileIdGlobalDb,
                    (*m_pBlockForReadFileIter)->getBlockId(),
                    (*m_pBlockForReadFileIter)->getOffset(),
                    (*m_pBlockForReadFileIter)->getLength(),
                    m_Order++);

    Local::Blocks blocks(m_pMongoLocal);
    blocks.UpdateInfo((*m_pBlockForReadFileIter)->getBlockId(),
                      (*m_pBlockForReadFileIter)->getLength());
}
void exampleFunc(Param<T> out, CParam<T> in, const af_someenum_t p)
{
    dim3 threads(TX, TY, 1); // set your CUDA launch config for threads per block

    dim_type blk_x = divup(out.dims[0], threads.x);
    dim_type blk_y = divup(out.dims[1], threads.y);
    dim3 blocks(blk_x, blk_y); // set your CUDA launch config for the grid

    // launch your kernel
    exampleFuncKernel<T><<<blocks, threads>>>(out, in, p);

    POST_LAUNCH_CHECK(); // macro for post-kernel-launch checks;
                         // these checks are carried out ONLY in DEBUG mode
}
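The CUDA_LAUNCH, CUDA_LAUNCH_SMEM, and POST_LAUNCH_CHECK macros used throughout these launchers wrap the raw <<<...>>> syntax shown above. A minimal sketch of what such helpers might expand to follows; the per-device stream lookup and the debug-only error check are assumptions for illustration, not the library's verbatim definitions.

// Hypothetical launch helpers, assuming a per-device stream accessor
// cuda::getStream() and an error-checking macro CUDA_CHECK().
#define CUDA_LAUNCH_SMEM(fn, blks, thrds, smem, ...)                        \
    fn<<<(blks), (thrds), (smem),                                           \
         cuda::getStream(cuda::getActiveDeviceId())>>>(__VA_ARGS__)

#define CUDA_LAUNCH(fn, blks, thrds, ...) \
    CUDA_LAUNCH_SMEM(fn, blks, thrds, 0, __VA_ARGS__)

#ifndef NDEBUG
// Debug builds surface asynchronous launch errors right away.
#define POST_LAUNCH_CHECK() CUDA_CHECK(cudaPeekAtLastError())
#else
#define POST_LAUNCH_CHECK()
#endif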
bool isValidSudoku(vector<vector<char> > &board) {
    vector<vector<bool> > rows(9, vector<bool>(9, false));
    vector<vector<bool> > columns(9, vector<bool>(9, false));
    vector<vector<bool> > blocks(9, vector<bool>(9, false));

    for (int i = 0; i < 9; i++) {
        for (int j = 0; j < 9; j++) {
            if (board[i][j] == '.')
                continue;
            int c = board[i][j] - '1';
            if (rows[i][c] || columns[j][c] || blocks[i / 3 * 3 + j / 3][c])
                return false;
            rows[i][c] = columns[j][c] = blocks[i / 3 * 3 + j / 3][c] = true;
        }
    }
    return true;
}
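The block index i / 3 * 3 + j / 3 relies on integer division to map cell (i, j) to one of the nine 3x3 sub-grids; for example, cell (4, 7) lands in block 4/3*3 + 7/3 = 3 + 2 = 5. A minimal, self-contained driver for the validator; the sample board contents below are made up for illustration.

#include <iostream>
#include <vector>
using std::vector;

// isValidSudoku as defined above.

int main() {
    vector<vector<char> > board(9, vector<char>(9, '.')); // '.' marks empty cells
    board[0][0] = '5';
    board[0][4] = '5'; // duplicate '5' in row 0, so the board is invalid
    std::cout << (isValidSudoku(board) ? "valid" : "invalid") << std::endl;
    return 0;
}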
void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims)
{
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(out.dims[0], TILEX);
    int blocksPerMatY = divup(out.dims[1], TILEY);
    dim3 blocks(blocksPerMatX * out.dims[2],
                blocksPerMatY * out.dims[3], 1);

    CUDA_LAUNCH((iota_kernel<T>), blocks, threads,
                out, sdims[0], sdims[1], sdims[2], sdims[3],
                tdims[0], tdims[1], tdims[2], tdims[3],
                blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
int main(int argc, char *argv[])
{
    Pooma::initialize(argc, argv);
    Pooma::Tester tester(argc, argv);

    int i;

    // Create the total domain.
    Interval<1> domain(12);

    // Create the block sizes.
    Loc<1> blocks(3), blocks2(4);

    // Create the partitioners.
    UniformGridPartition<1> partition(blocks), partition2(blocks2);

    // Create the layouts.
    UniformGridLayout<1> layout(domain, partition, ReplicatedTag());
    UniformGridLayout<1> layout2(domain, partition2, ReplicatedTag());

    // Make some UMP arrays and fill them.
    Array<1, double, Brick> a(12), ans(12);
    Array<1, double, MultiPatch<UniformTag, Brick> > bb(layout), cc(layout2);

    for (i = 0; i < 12; i++) {
        bb(i) = 1.0 + i;
        cc(i) = -2.3 * i;
        ans(i) = bb(i) + 3.0 * cc(i);
    }

    a = bb + 3.0 * cc;
    Pooma::blockAndEvaluate();

    for (i = 0; i < 12; i++) {
        tester.check(a(i) == ans(i));
    }

    int ret = tester.results("ump_test2");
    Pooma::finalize();
    return ret;
}
void sobel(Param<To> dx, Param<To> dy, CParam<Ti> in, const unsigned &ker_size)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    int blk_x = divup(in.dims[0], threads.x);
    int blk_y = divup(in.dims[1], threads.y);

    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    // TODO: add more cases when 5x5 and 7x7 kernels are done
    switch (ker_size) {
        case 3:
            CUDA_LAUNCH((sobel3x3<Ti, To>), blocks, threads,
                        dx, dy, in, blk_x, blk_y);
            break;
    }

    POST_LAUNCH_CHECK();
}
InputParameters validParams<Action>()
{
    InputParameters params;

    std::vector<std::string> blocks(1);
    blocks[0] = "__all__";

    // Add the "active" parameter to all blocks to support selective child
    // visitation (turn blocks on and off without comments).
    params.addParam<std::vector<std::string> >(
        "active", blocks,
        "If specified only the blocks named will be visited and made active");

    params.addPrivateParam<std::string>("_action_name"); // the name passed to ActionFactory::create
    params.addPrivateParam<std::string>("task");
    params.addPrivateParam<std::string>("registered_identifier");
    params.addPrivateParam<std::string>("action_type");
    params.addPrivateParam<ActionWarehouse *>("awh", NULL);

    return params;
}
void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in)
{
    dim3 threads(TX, TY, 1);

    int blocksPerMatX = divup(in.dims[0], TX);
    int blocksPerMatY = divup(in.dims[1], TY);
    dim3 blocks(blocksPerMatX * in.dims[2],
                blocksPerMatY * in.dims[3], 1);

    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    CUDA_LAUNCH((gradient_kernel<T>), blocks, threads,
                grad0, grad1, in, blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
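The two divup lines fold an oversized grid-Y extent into the Z dimension. For example, if blocks.y comes out as 100000 on a device whose maxGridSize[1] is 65535, then blocks.z = divup(100000, 65535) = 2 and blocks.y = divup(100000, 2) = 50000; blocks.y * blocks.z = 100000 still covers every tile while each individual grid dimension stays within the device limit.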
Board getCopy() {
    std::vector< std::vector<int> > blocks(N);
    int positionPointer = 0;
    for (int i = 0; i < N; ++i) {
        std::vector<int> row(N);
        for (int j = 0; j < N; ++j)
            row[j] = position[positionPointer++];
        blocks[i] = row;
    }
    Board newBoard(blocks);
    return newBoard;
}
bool ResourcePool::lockBlocks(DWORD moduleSize) {
    bool result = false;
    DWORD need = blocks(moduleSize);

    _mFreeBuffers.lock();
    if (need < availables()) {
        _locked += need;
        result = true;
    }
    // printf("[ResourcePool] lock: id=%d max=%ld, allocated=%ld, free=%d, locked=%ld, availables=%ld, need=%ld, result=%d\n",
    //        _id, _max, _allocated, _freeBuffers.size(), _locked, availables(), need, result);
    _mFreeBuffers.unlock();

    return result;
}
void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                const dim_t sx, const dim_t sy, const dim_t px, const dim_t py,
                const dim_t nx)
{
    dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));

    dim3 threads(TX, THREADS_PER_BLOCK / TX);
    dim3 blocks(divup(out.dims[1], threads.y), out.dims[2] * out.dims[3]);

    dim_t reps = divup((wx * wy), threads.x); // is > 1 only when TX == 256 && wx * wy > 256

    CUDA_LAUNCH((unwrap_kernel<T, true>), blocks, threads,
                out, in, wx, wy, sx, sy, px, py, nx, reps);
    POST_LAUNCH_CHECK();
}
void rotate(Param<T> out, CParam<T> in, const float theta)
{
    const float c = cos(-theta), s = sin(-theta);
    float tx, ty;
    {
        const float nx = 0.5 * (in.dims[0]  - 1);
        const float ny = 0.5 * (in.dims[1]  - 1);
        const float mx = 0.5 * (out.dims[0] - 1);
        const float my = 0.5 * (out.dims[1] - 1);
        const float sx = (mx * c + my * -s);
        const float sy = (mx * s + my *  c);
        tx = -(sx - nx);
        ty = -(sy - ny);
    }

    // Rounding error: anything beyond 3 decimal places won't make a difference.
    tmat_t t;
    t.tmat[0] = round( c * 1000) / 1000.0f;
    t.tmat[1] = round(-s * 1000) / 1000.0f;
    t.tmat[2] = round(tx * 1000) / 1000.0f;
    t.tmat[3] = round( s * 1000) / 1000.0f;
    t.tmat[4] = round( c * 1000) / 1000.0f;
    t.tmat[5] = round(ty * 1000) / 1000.0f;

    int nimages  = in.dims[2];
    int nbatches = in.dims[3];

    dim3 threads(TX, TY, 1);
    dim3 blocks(divup(out.dims[0], threads.x),
                divup(out.dims[1], threads.y));

    const int blocksXPerImage = blocks.x;
    const int blocksYPerImage = blocks.y;

    if (nimages > TI) {
        int tile_images = divup(nimages, TI);
        nimages = TI;
        blocks.x = blocks.x * tile_images;
    }
    blocks.y = blocks.y * nbatches;

    CUDA_LAUNCH((rotate_kernel<T, method>), blocks, threads,
                out, in, t, nimages, nbatches,
                blocksXPerImage, blocksYPerImage);
    POST_LAUNCH_CHECK();
}
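Reading the tmat layout off the assignments above, the kernel presumably applies the inverse mapping to every output pixel (xo, yo): xi = c*xo - s*yo + tx and yi = s*xo + c*yo + ty. The translation (tx, ty) is chosen so that the output center (mx, my) maps exactly onto the input center (nx, ny), which is what the sx/sy block computes.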
/// Initializes the operand stack state for the given bytecode index;
/// typically called at the beginning of a basic block.
/// Returns the bytecode index provided.
virtual int32_t setupBBStartContext(int32_t index)
{
    if (_stacks[index] != NULL) {
        *_stack = *_stacks[index];
        _stackTemps = *_stacks[index];
    } else {
        if (_stack)
            _stack->clear();
        _stackTemps.clear();
    }
    _block = blocks(index);
    return index;
}
static void bcast_first_launcher(Param<To> out, CParam<To> tmp,
                                 const uint blocks_x, const uint blocks_y,
                                 const uint threads_x)
{
    dim3 threads(threads_x, THREADS_PER_BLOCK / threads_x);
    dim3 blocks(blocks_x * out.dims[2], blocks_y * out.dims[3]);

    uint lim = divup(out.dims[0], (threads_x * blocks_x));

    CUDA_LAUNCH((bcast_first_kernel<To, op>), blocks, threads,
                out, tmp, blocks_x, blocks_y, lim);
    POST_LAUNCH_CHECK();
}
// uri_is_blocked provides a thin wrapper around the block RPC call, to
// deal with errors and provide the page uri.
static gboolean uri_is_blocked(const char *uri, guint64 flags, Exten *exten)
{
    GError *err = NULL;
    gboolean ret = blocks(
        uri, webkit_web_page_get_uri(exten->web_page), flags, exten, &err);
    if (err != NULL) {
        printf("Failed to check if uri is blocked: %s\n", err->message);
        g_error_free(err);
        return FALSE;
    }
    return ret;
}
void histogram(Param<outType> out, CParam<inType> in,
               int nbins, float minval, float maxval)
{
    dim3 threads(kernel::THREADS_X, 1);

    int nElems = in.dims[0] * in.dims[1];
    int blk_x  = divup(nElems, THRD_LOAD * THREADS_X);

    dim3 blocks(blk_x * in.dims[2], in.dims[3]);

    // If nbins > MAX_BINS, the kernel falls back to global memory, so smem_size can be 0.
    int smem_size = nbins <= MAX_BINS ? (nbins * sizeof(outType)) : 0;

    CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>),
                     blocks, threads, smem_size,
                     out, in, nElems, nbins, minval, maxval, blk_x);
    POST_LAUNCH_CHECK();
}
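A minimal sketch of the shared-memory strategy this launcher selects when nbins <= MAX_BINS: each block accumulates a private histogram in shared memory and merges it into the global result once at the end. The kernel body below is an assumption for illustration (it uses a plain grid-stride loop rather than the THRD_LOAD batching above), not the actual histogramKernel.

// Per-block shared-memory histogram; per-element atomicAdds stay in fast
// shared memory, and only nbins global atomics happen per block.
__global__ void smemHistogramSketch(unsigned *out, const float *in,
                                    int nElems, int nbins,
                                    float minval, float maxval) {
    extern __shared__ unsigned localHist[]; // nbins counters per block
    for (int b = threadIdx.x; b < nbins; b += blockDim.x) localHist[b] = 0;
    __syncthreads();

    const float scale = nbins / (maxval - minval);
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nElems;
         i += gridDim.x * blockDim.x) {
        int bin = min(nbins - 1, max(0, (int)((in[i] - minval) * scale)));
        atomicAdd(&localHist[bin], 1u);
    }
    __syncthreads();

    for (int b = threadIdx.x; b < nbins; b += blockDim.x)
        atomicAdd(&out[b], localHist[b]); // merge into the global histogram
}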
std::string picker_log_alert::message() const
{
    static char const* const flag_names[] =
    {
        "partial_ratio ",
        "prioritize_partials ",
        "rarest_first_partials ",
        "rarest_first ",
        "reverse_rarest_first ",
        "suggested_pieces ",
        "prio_sequential_pieces ",
        "sequential_pieces ",
        "reverse_pieces ",
        "time_critical ",
        "random_pieces ",
        "prefer_contiguous ",
        "reverse_sequential ",
        "backup1 ",
        "backup2 ",
        "end_game "
    };

    std::string ret = peer_alert::message();

    boost::uint32_t flags = picker_flags;
    int idx = 0;
    ret += " picker_log [ ";
    for (; flags != 0; flags >>= 1, ++idx)
    {
        if ((flags & 1) == 0) continue;
        ret += flag_names[idx];
    }
    ret += "] ";

    std::vector<piece_block> b = blocks();
    for (int i = 0; i < int(b.size()); ++i)
    {
        char buf[50];
        snprintf(buf, sizeof(buf), "(%d,%d) ",
                 b[i].piece_index, b[i].block_index);
        ret += buf;
    }
    return ret;
}
static void bcast_dim_launcher(Param<To> out, CParam<To> tmp,
                               const uint threads_y, const uint blocks_all[4])
{
    dim3 threads(THREADS_X, threads_y);
    dim3 blocks(blocks_all[0] * blocks_all[2],
                blocks_all[1] * blocks_all[3]);

    uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

    CUDA_LAUNCH((bcast_dim_kernel<To, op, dim>), blocks, threads,
                out, tmp, blocks_all[0], blocks_all[1], blocks_all[dim], lim);
    POST_LAUNCH_CHECK();
}
bool isValidSudoku2(vector<vector<char> > &board) { // an extremely clever approach; worth digesting slowly
    vector<vector<bool> > rows(9, vector<bool>(9, false));
    vector<vector<bool> > cols(9, vector<bool>(9, false));
    vector<vector<bool> > blocks(9, vector<bool>(9, false));

    for (int i = 0; i < 9; ++i) {
        for (int j = 0; j < 9; ++j) {
            if (board[i][j] == '.')
                continue;
            int c = board[i][j] - '1';
            if (rows[i][c] || cols[j][c] || blocks[i - i % 3 + j / 3][c])
                return false;
            rows[i][c] = cols[j][c] = blocks[i - i % 3 + j / 3][c] = true;
        }
    }
    return true;
}
shared_ptr<ChunkModelResult> compute_chunk(const ChunkModelData &data,
                                           const BlockTypeInfo &block_data) {
    std::vector<BlockData> blocks(XZ_SIZE * XZ_SIZE * XZ_SIZE);
    std::vector<char> highest(XZ_SIZE * XZ_SIZE);

    BlockData *above = data.above->blocks;
    BlockData *below = data.below->blocks;
    BlockData *left = data.left->blocks;
    BlockData *right = data.right->blocks;
    BlockData *front = data.front->blocks;
    BlockData *back = data.back->blocks;
    BlockData *above_left = data.above_left->blocks;
    BlockData *above_right = data.above_right->blocks;
    BlockData *above_front = data.above_front->blocks;
    BlockData *above_back = data.above_back->blocks;
    BlockData *above_left_front = data.above_left_front->blocks;
    BlockData *above_right_front = data.above_right_front->blocks;
    BlockData *above_left_back = data.above_left_back->blocks;
    BlockData *above_right_back = data.above_right_back->blocks;
    BlockData *left_front = data.left_front->blocks;
    BlockData *right_front = data.right_front->blocks;
    BlockData *left_back = data.left_back->blocks;
    BlockData *right_back = data.right_back->blocks;

    const char *is_transparent = block_data.is_transparent;
    const char *is_plant = block_data.is_plant;
    const char *state = block_data.state;

    int ox = -CHUNK_SIZE - 1;
    int oy = -CHUNK_SIZE - 1;
    int oz = -CHUNK_SIZE - 1;

    /* Populate the blocks array with the chunk itself */
    const BlockData *self = data.self->blocks;
    CHUNK_FOR_EACH(self, ex, ey, ez, eb) {
        int x = ex - ox;
        int y = ey - oy;
        int z = ez - oz;
        blocks[XYZ(x, y, z)] = eb;
        if (!is_transparent[eb.type]) {
            highest[XZ(x, z)] = std::max((int)highest[XZ(x, z)], y);
        }
    } END_CHUNK_FOR_EACH;
void visit(TransID tid) {
    auto tidRegion = m_profData->transRegion(tid);
    auto tidInstrs = tidRegion->instrSize();
    if (m_numBCInstrs + tidInstrs > RuntimeOption::EvalJitMaxRegionInstrs) {
        return;
    }

    if (m_visited.count(tid)) return;
    m_visited.insert(tid);
    m_visiting.insert(tid);

    if (!breaksRegion(*(m_profData->transLastInstr(tid)))) {
        auto srcBlockId = tidRegion->blocks().back().get()->id();

        for (auto const arc : m_cfg.outArcs(tid)) {
            auto dst = arc->dst();

            // If dst is in the visiting set then this arc forms a cycle. Don't
            // include it unless we've asked for loops.
            if (!RuntimeOption::EvalJitLoops && m_visiting.count(dst)) continue;

            // Skip dst if we already generated a region starting at that SrcKey.
            auto dstSK = m_profData->transSrcKey(dst);
            if (m_profData->optimized(dstSK)) continue;

            auto dstBlockId = m_profData->transRegion(dst)->entry()->id();
            m_arcs.push_back({srcBlockId, dstBlockId});

            visit(dst);
        }
    }

    // Now insert the region for tid in the front of m_region. We do
    // this last so that the region ends up in (quasi-)topological order
    // (it'll be in topological order for acyclic regions).
    m_region->prepend(*tidRegion);
    m_selectedSet.insert(tid);
    if (m_selectedVec) m_selectedVec->push_back(tid);
    m_numBCInstrs += tidRegion->instrSize();
    always_assert(m_numBCInstrs <= RuntimeOption::EvalJitMaxRegionInstrs);

    m_visiting.erase(tid);
}
void ResourcePool::unlockBlocks(DWORD moduleSize) {
    DWORD need = blocks(moduleSize);
    // printf("[ResourcePool] unlock: id=%d max=%ld, allocated=%ld, free=%d, locked=%ld, availables=%ld, need=%ld\n",
    //        _id, _max, _allocated, _freeBuffers.size(), _locked, availables(), need);
    _mFreeBuffers.lock();
    if (_locked >= need) {
        _locked -= need;
    } else {
        printf("[ResourcePool] Warning, unlock blocks error: id=%d, need=%d, _locked=%d\n",
               _id, need, _locked);
        assert(false);
        _locked = 0;
    }
    _mFreeBuffers.unlock();
    _cWakeup.notify_all();
}
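A minimal usage sketch of the lock/unlock pair above, assuming blocks(moduleSize) converts a byte size into a block count; the caller and its names are hypothetical.

// Hypothetical caller: reserve capacity for a module, use it, release it.
void processModule(ResourcePool &pool, DWORD moduleSize) {
    if (!pool.lockBlocks(moduleSize)) // reservation failed: not enough free blocks
        return;                       // caller decides whether to retry or give up
    // ... draw buffers from the pool and do the work ...
    pool.unlockBlocks(moduleSize);    // release exactly what was reserved
}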
void nonMaximal(float* x_out, float* y_out, float* resp_out, unsigned* count,
                const unsigned idim0, const unsigned idim1,
                const T* resp_in, const unsigned edge,
                const unsigned max_corners)
{
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0 - edge * 2, BLOCK_X),
                divup(idim1 - edge * 2, BLOCK_Y));

    unsigned* d_corners_found = memAlloc<unsigned>(1);
    CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
                               cuda::getStream(cuda::getActiveDeviceId())));

    CUDA_LAUNCH((nonMaxKernel<T>), blocks, threads,
                x_out, y_out, resp_out, d_corners_found,
                idim0, idim1, resp_in, edge, max_corners);
    POST_LAUNCH_CHECK();

    CUDA_CHECK(cudaMemcpy(count, d_corners_found, sizeof(unsigned),
                          cudaMemcpyDeviceToHost));
    memFree(d_corners_found);
}
int LeaseManager::renew(const int32_t timeout_ms, const int32_t who)
{
    int ret = TFS_SUCCESS;
    DsRuntimeGlobalInformation& ds_info = DsRuntimeGlobalInformation::instance();
    DsRenewLeaseMessage req_msg;
    req_msg.set_ds_stat(ds_info.information_);
    if (is_master(who))
    {
        BlockInfoV2* block_infos = req_msg.get_block_infos();
        ArrayHelper<BlockInfoV2> blocks(MAX_WRITABLE_BLOCK_COUNT, block_infos);
        get_writable_block_manager().get_blocks(blocks, BLOCK_WRITABLE);
        req_msg.set_size(blocks.get_array_index());
    }

    tbnet::Packet* ret_msg = NULL;
    NewClient* new_client = NewClientManager::get_instance().create_client();
    ret = (NULL != new_client) ? TFS_SUCCESS : EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR;
    if (TFS_SUCCESS == ret)
    {
        ret = send_msg_to_server(ns_ip_port_[who], new_client, &req_msg, ret_msg, timeout_ms);
        if (TFS_SUCCESS == ret)
        {
            if (DS_RENEW_LEASE_RESPONSE_MESSAGE == ret_msg->getPCode())
            {
                DsRenewLeaseResponseMessage* resp_msg =
                    dynamic_cast<DsRenewLeaseResponseMessage*>(ret_msg);
                process_renew_response(resp_msg, who);
            }
            else if (STATUS_MESSAGE == ret_msg->getPCode())
            {
                StatusMessage* resp_msg = dynamic_cast<StatusMessage*>(ret_msg);
                ret = resp_msg->get_status();
            }
            else
            {
                ret = EXIT_UNKNOWN_MSGTYPE;
            }
        }
        NewClientManager::get_instance().destroy_client(new_client);
    }
    return ret;
}
void select_scalar(Param<T> out, CParam<char> cond, CParam<T> a,
                   const double b, int ndims)
{
    dim3 threads(DIMX, DIMY);

    if (ndims == 1) {
        threads.x *= threads.y;
        threads.y = 1;
    }

    int blk_x = divup(out.dims[0], threads.x);
    int blk_y = divup(out.dims[1], threads.y);

    // Tile the batch dimensions into the grid, matching the other launchers here
    // (the original multiplied by threads.x/threads.y, which over-sizes the grid).
    dim3 blocks(blk_x * out.dims[2], blk_y * out.dims[3]);

    CUDA_LAUNCH((select_scalar_kernel<T, flip>), blocks, threads,
                out, cond, a, scalar<T>(b), blk_x, blk_y);
    POST_LAUNCH_CHECK();
}
void transpose(Param<T> out, CParam<T> in, const int ndims)
{
    // dimensions passed to this function should be input dimensions;
    // any necessary transformations and dimension-related calculations are
    // carried out here and inside the kernel
    dim3 threads(kernel::THREADS_X, kernel::THREADS_Y);

    int blk_x = divup(in.dims[0], TILE_DIM);
    int blk_y = divup(in.dims[1], TILE_DIM);

    // launch batch * blk_x blocks along the x dimension
    dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

    if (in.dims[0] % TILE_DIM == 0 && in.dims[1] % TILE_DIM == 0)
        CUDA_LAUNCH((transpose<T, conjugate, true >), blocks, threads,
                    out, in, blk_x, blk_y);
    else
        CUDA_LAUNCH((transpose<T, conjugate, false>), blocks, threads,
                    out, in, blk_x, blk_y);

    POST_LAUNCH_CHECK();
}
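The third template argument lets the kernel skip bounds checks when both leading dimensions are multiples of TILE_DIM. A minimal sketch of the padded shared-memory tiling such a transpose typically uses follows; the body is an illustrative assumption (one thread per tile element, batch indexing and the conjugate flag omitted), not the actual kernel.

// Classic tiled transpose: coalesced reads and writes, with the +1 column
// padding avoiding shared-memory bank conflicts. Assumes blockDim is
// (TILE_DIM, TILE_DIM), input leading dimension d0, output leading dimension d1.
template<typename T, bool is32Multiple>
__global__ void transposeSketch(T *out, const T *in, int d0, int d1) {
    __shared__ T tile[TILE_DIM][TILE_DIM + 1];

    int x = blockIdx.x * TILE_DIM + threadIdx.x; // index along dim0 of the input
    int y = blockIdx.y * TILE_DIM + threadIdx.y; // index along dim1 of the input
    if (is32Multiple || (x < d0 && y < d1))
        tile[threadIdx.y][threadIdx.x] = in[y * d0 + x];
    __syncthreads();

    // Write the tile back transposed: swap block indices, keep thread indices.
    x = blockIdx.y * TILE_DIM + threadIdx.x;     // index along dim0 of the output
    y = blockIdx.x * TILE_DIM + threadIdx.y;     // index along dim1 of the output
    if (is32Multiple || (x < d1 && y < d0))
        out[y * d1 + x] = tile[threadIdx.x][threadIdx.y];
}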