예제 #1
0
파일: susan.hpp 프로젝트: 9prady9/arrayfire
// Launches susanKernel<T> over the image with an `edge`-wide border removed
// from both ends of each dimension.
//
// out, in       : buffers handed to the kernel unchanged
// idim0, idim1  : extents used (minus 2 * edge) to size the launch grid
// radius        : forwarded to the kernel; also sizes the byte count given to
//                 CUDA_LAUNCH_SMEM
// t, g, edge    : forwarded to the kernel unchanged
void susan_responses(T* out, const T* in, const unsigned idim0,
                     const unsigned idim1, const int radius, const float t,
                     const float g, const unsigned edge) {
    // Bytes for a (BLOCK_X + 2*radius) x (BLOCK_Y + 2*radius) tile of T.
    const size_t SMEM_SIZE =
        (BLOCK_X + 2 * radius) * (BLOCK_Y + 2 * radius) * sizeof(T);

    dim3 block(BLOCK_X, BLOCK_Y);
    dim3 grid(divup(idim0 - edge * 2, BLOCK_X),
              divup(idim1 - edge * 2, BLOCK_Y));

    CUDA_LAUNCH_SMEM((susanKernel<T>), grid, block, SMEM_SIZE, out, in,
                     idim0, idim1, radius, t, g, edge);

    POST_LAUNCH_CHECK();
}
예제 #2
0
        // Launches unwrap_kernel<T, false>.
        //
        // Grid x spans out.dims[0] in steps of the block width; grid y holds
        // out.dims[2] * out.dims[3] entries. `reps` is (wx * wy) divided by
        // the block height, rounded up, and is passed to the kernel together
        // with the window/stride/padding arguments.
        void unwrap_row(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                        const dim_t sx, const dim_t sy,
                        const dim_t px, const dim_t py, const dim_t nx)
        {
            dim3 block(THREADS_X, THREADS_Y);
            dim3 grid(divup(out.dims[0], block.x), out.dims[2] * out.dims[3]);

            dim_t reps = divup((wx * wy), block.y);

            CUDA_LAUNCH((unwrap_kernel<T, false>), grid, block,
                        out, in, wx, wy, sx, sy, px, py, nx, reps);

            POST_LAUNCH_CHECK();
        }
예제 #3
0
파일: EntryData.cpp 프로젝트: wagenadl/eln
void EntryData::newSheet() {
  int newMax = 0;
  // I think we can actually assume the last block is on the last page,
  // but I am going to be bloody minded about it.
  foreach (BlockData *b, blocks()) {
    if (b->lastSheet()>newMax)
      newMax = b->lastSheet();
  }
  bool doEmit = maxSheet>=0;
  if (newMax != maxSheet) {
    maxSheet = newMax;
    if (doEmit)
      emit sheetCountMod();
  }
}  
예제 #4
0
파일: EntryData.cpp 프로젝트: wagenadl/eln
void EntryData::loadMore(QVariantMap const &src) {
  Data::loadMore(src);

  // Any old title has already been destructed by Data's loadChildren(),
  // so the first TitleData child is the one to wire up.
  TitleData *loadedTitle = firstChild<TitleData>();
  ASSERT(loadedTitle);
  connect(loadedTitle, SIGNAL(textMod()), SIGNAL(titleMod()));

  // Track the largest lastSheet() and route every block's sheet-related
  // signals to newSheet().
  maxSheet = 0;
  foreach (BlockData *blk, blocks()) {
    if (maxSheet < blk->lastSheet())
      maxSheet = blk->lastSheet();
    connect(blk, SIGNAL(newSheet(int)), SLOT(newSheet()));
    connect(blk, SIGNAL(sheetCountMod(int)), SLOT(newSheet()));
  }
}
예제 #5
0
void Bk::setDataInMongoLocal() {
    if (m_Order > 0) {
        delete *m_pBlockForReadFileIter;
        m_pBlockForReadFileIter++;
    }
    Local::Files file(m_pMongoLocal);
    file.InsertInfo(m_FileIdGlobalDb, (*m_pBlockForReadFileIter)->getBlockId(),
                    (*m_pBlockForReadFileIter)->getOffset(),
                    (*m_pBlockForReadFileIter)->getLength(),
                    m_Order++);

    Local::Blocks blocks(m_pMongoLocal);
    blocks.UpdateInfo((*m_pBlockForReadFileIter)->getBlockId(),
                      (*m_pBlockForReadFileIter)->getLength());
}
예제 #6
0
// Template for a CUDA kernel wrapper: builds the launch configuration from the
// output dimensions and launches exampleFuncKernel<T>.
void exampleFunc(Param<T> out, CParam<T> in, const af_someenum_t p)
{
    // Threads per block.
    dim3 threads(TX, TY, 1);

    // Blocks per grid: enough blocks to span out.dims[0] and out.dims[1].
    dim_type blocksAlongX = divup(out.dims[0], threads.x);
    dim_type blocksAlongY = divup(out.dims[1], threads.y);
    dim3 blocks(blocksAlongX, blocksAlongY);

    exampleFuncKernel<T> <<<blocks, threads>>> (out, in, p);

    // Post-launch check macro (per the original note, active only in DEBUG mode).
    POST_LAUNCH_CHECK();
}
예제 #7
0
파일: ValidSuduku.cpp 프로젝트: gemire/code
 // Checks that the filled cells of a Sudoku board do not repeat a digit
 // within any row, any column, or any 3x3 block. Empty cells are '.'.
 //
 // Only the top-left 9x9 region of `board` is inspected. Returns false when
 // the board has fewer than 9 rows, a row has fewer than 9 cells, or a cell
 // holds anything other than '.' or '1'..'9' (these inputs would otherwise
 // index the bookkeeping tables out of range).
 bool isValidSudoku(std::vector<std::vector<char> > &board) {
     if (board.size() < 9)
         return false;
     for (int i = 0; i < 9; i++)
         if (board[i].size() < 9)
             return false;

     std::vector<std::vector<bool> > rows(9, std::vector<bool>(9, false));
     std::vector<std::vector<bool> > columns(9, std::vector<bool>(9, false));
     std::vector<std::vector<bool> > blocks(9, std::vector<bool>(9, false));
     for (int i = 0; i < 9; i++)
         for (int j = 0; j < 9; j++) {
             const char cell = board[i][j];
             if (cell == '.')
                 continue;
             if (cell < '1' || cell > '9')
                 return false;
             const int c = cell - '1';
             // 3x3 blocks are numbered 0..8, left to right, top to bottom.
             const int b = i / 3 * 3 + j / 3;
             if (rows[i][c] || columns[j][c] || blocks[b][c])
                 return false;
             rows[i][c] = columns[j][c] = blocks[b][c] = true;
         }
     return true;
 }
예제 #8
0
        // Launches iota_kernel<T>.
        //
        // blocksPerMatX / blocksPerMatY are out.dims[0] / out.dims[1] divided
        // by TILEX / TILEY, rounded up; the grid multiplies them by
        // out.dims[2] and out.dims[3] respectively. The four components of
        // sdims and tdims are passed to the kernel individually.
        void iota(Param<T> out, const dim4 &sdims, const dim4 &tdims)
        {
            int blocksPerMatX = divup(out.dims[0], TILEX);
            int blocksPerMatY = divup(out.dims[1], TILEY);

            dim3 block(TX, TY, 1);
            dim3 grid(blocksPerMatX * out.dims[2],
                      blocksPerMatY * out.dims[3],
                      1);

            CUDA_LAUNCH((iota_kernel<T>), grid, block,
                    out,
                    sdims[0], sdims[1], sdims[2], sdims[3],
                    tdims[0], tdims[1], tdims[2], tdims[3],
                    blocksPerMatX, blocksPerMatY);
            POST_LAUNCH_CHECK();
        }
예제 #9
0
// Evaluates a = bb + 3.0 * cc, where bb and cc are multi-patch arrays built on
// two different layouts of the same 12-element domain, and checks every
// element against a reference computed element by element.
int main(int argc, char *argv[])
{
  Pooma::initialize(argc,argv);
  Pooma::Tester tester(argc,argv);

  // Total domain.
  Interval<1> domain(12);

  // Loc values handed to the two partitioners.
  Loc<1> blocks(3), blocks2(4);

  // Partitioners, one per Loc.
  UniformGridPartition<1> partition(blocks), partition2(blocks2);

  // Layouts over the same domain, one per partitioner.
  UniformGridLayout<1> layout(domain, partition, ReplicatedTag());
  UniformGridLayout<1> layout2(domain, partition2, ReplicatedTag());

  // Brick arrays for the result and the reference, multi-patch arrays for
  // the operands.
  Array<1, double, Brick > a(12), ans(12);
  Array<1, double, MultiPatch<UniformTag,Brick> > bb(layout), cc(layout2);

  // Fill the operands and compute the expected result per element.
  for (int i = 0; i < 12; i++)
    {
      bb(i) = 1.0 + i;
      cc(i) = -2.3 * i;
      ans(i) = bb(i) + 3.0 * cc(i);
    }

  // Whole-array expression under test.
  a = bb + 3.0 * cc;

  Pooma::blockAndEvaluate();

  for (int i = 0; i < 12; i++)
    {
      tester.check(a(i) == ans(i));
    }

  int ret = tester.results("ump_test2");
  Pooma::finalize();
  return ret;
}
예제 #10
0
// Launches the Sobel kernel matching `ker_size`, writing to dx and dy.
//
// Only ker_size == 3 launches a kernel (sobel3x3); any other value launches
// nothing before the post-launch check.
void sobel(Param<To> dx, Param<To> dy, CParam<Ti> in, const unsigned &ker_size)
{
    const dim3 threads(THREADS_X, THREADS_Y);

    // Blocks needed to span in.dims[0] and in.dims[1].
    int blk_x = divup(in.dims[0], threads.x);
    int blk_y = divup(in.dims[1], threads.y);

    // Grid scaled by in.dims[2] along x and in.dims[3] along y.
    dim3 blocks(blk_x*in.dims[2], blk_y*in.dims[3]);

    //TODO: add more cases when 5x5 and 7x7 kernels are done
    if (ker_size == 3) {
        CUDA_LAUNCH((sobel3x3<Ti, To>), blocks, threads, dx, dy, in, blk_x, blk_y);
    }

    POST_LAUNCH_CHECK();
}
예제 #11
0
파일: Action.C 프로젝트: AhmedAly83/moose
// Builds the parameter set for Action: an "active" list defaulting to the
// single entry "__all__", plus private parameters.
InputParameters validParams<Action>()
{
  InputParameters params;

  // Add the "active" parameter to all blocks to support selective child
  // visitation (turn blocks on and off without comments).
  std::vector<std::string> default_active(1, "__all__");
  params.addParam<std::vector<std::string> >("active", default_active, "If specified only the blocks named will be visited and made active");

  // Private parameters.
  params.addPrivateParam<std::string>("_action_name"); // the name passed to ActionFactory::create
  params.addPrivateParam<std::string>("task");
  params.addPrivateParam<std::string>("registered_identifier");
  params.addPrivateParam<std::string>("action_type");
  params.addPrivateParam<ActionWarehouse *>("awh", NULL);

  return params;
}
예제 #12
0
// Launches gradient_kernel<T>, writing to grad0 and grad1.
//
// The grid starts as (blocksPerMatX * in.dims[2]) x (blocksPerMatY *
// in.dims[3]); its y extent is then split across z so that y stays within the
// active device's maxGridSize[1].
void gradient(Param<T> grad0, Param<T> grad1, CParam<T> in) {
    int blocksPerMatX = divup(in.dims[0], TX);
    int blocksPerMatY = divup(in.dims[1], TY);

    dim3 threads(TX, TY, 1);
    dim3 blocks(blocksPerMatX * in.dims[2], blocksPerMatY * in.dims[3], 1);

    // z = number of y-chunks needed, then y = rows per chunk (rounded up).
    const int maxBlocksY =
        cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
    blocks.z = divup(blocks.y, maxBlocksY);
    blocks.y = divup(blocks.y, blocks.z);

    CUDA_LAUNCH((gradient_kernel<T>), blocks, threads,
                grad0, grad1, in, blocksPerMatX, blocksPerMatY);
    POST_LAUNCH_CHECK();
}
예제 #13
0
 // Returns a new Board built from an N x N grid that is filled, row by row,
 // from the first N * N entries of `position`.
 Board getCopy()
 {
     std::vector< std::vector<int> > grid (N, std::vector<int>(N));

     // Flat index `cell` maps to row cell / N, column cell % N.
     for (int cell = 0; cell < N * N; ++cell)
         grid[cell / N][cell % N] = position[cell];

     return Board(grid);
 }
예제 #14
0
// Tries to reserve blocks(moduleSize) units. Returns true and increases
// _locked when that amount is strictly less than availables(); otherwise
// returns false and leaves _locked unchanged.
bool ResourcePool::lockBlocks( DWORD moduleSize ) {
    const DWORD need = blocks( moduleSize );

    // The comparison and the update happen while _mFreeBuffers is held.
    _mFreeBuffers.lock();
    const bool granted = need < availables();
    if (granted) {
        _locked += need;
    }
    _mFreeBuffers.unlock();

    return granted;
}
예제 #15
0
        // Launches unwrap_kernel<T, true>.
        //
        // The block width TX is the smaller of THREADS_PER_BLOCK and
        // nextpow2(out.dims[0]); the block height is THREADS_PER_BLOCK / TX.
        // Grid x spans out.dims[1] in steps of the block height; grid y holds
        // out.dims[2] * out.dims[3] entries.
        void unwrap_col(Param<T> out, CParam<T> in, const dim_t wx, const dim_t wy,
                        const dim_t sx, const dim_t sy,
                        const dim_t px, const dim_t py, const dim_t nx)
        {
            dim_t TX = std::min(THREADS_PER_BLOCK, nextpow2(out.dims[0]));

            dim3 block(TX, THREADS_PER_BLOCK / TX);
            dim3 grid(divup(out.dims[1], block.y), out.dims[2] * out.dims[3]);

            // is > 1 only when TX == 256 && wx * wy > 256
            dim_t reps = divup((wx * wy), block.x);

            CUDA_LAUNCH((unwrap_kernel<T, true>), grid, block,
                        out, in, wx, wy, sx, sy, px, py, nx, reps);

            POST_LAUNCH_CHECK();
        }
예제 #16
0
파일: rotate.hpp 프로젝트: hxiaox/arrayfire
        // Builds a 2x3 transform for a rotation by -theta plus a translation
        // derived from the half-extents of `in` and `out`, then launches
        // rotate_kernel<T, method>.
        void rotate(Param<T> out, CParam<T> in, const float theta)
        {
            const float c = cos(-theta), s = sin(-theta);

            // Half-extents ((dim - 1) / 2) of the input (n*) and output (m*).
            const float nx = 0.5 * (in.dims[0] - 1);
            const float ny = 0.5 * (in.dims[1] - 1);
            const float mx = 0.5 * (out.dims[0] - 1);
            const float my = 0.5 * (out.dims[1] - 1);

            // Rotate the output half-extents; the translation is the negated
            // difference to the input half-extents.
            const float sx = (mx * c + my *-s);
            const float sy = (mx * s + my * c);
            const float tx = -(sx - nx);
            const float ty = -(sy - ny);

            // Rounding error. Anything more than 3 decimal points wont make a diff
            const float entries[6] = { c, -s, tx,
                                       s,  c, ty };
            tmat_t t;
            for (int i = 0; i < 6; ++i) {
                t.tmat[i] = round(entries[i] * 1000) / 1000.0f;
            }

            int nimages = in.dims[2];
            int nbatches = in.dims[3];

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            // Remember the per-image grid before it is scaled below.
            const int blocksXPerImage = blocks.x;
            const int blocksYPerImage = blocks.y;

            // More than TI images: scale grid x by ceil(nimages / TI) and
            // pass TI to the kernel as the image count.
            if(nimages > TI) {
                blocks.x *= divup(nimages, TI);
                nimages = TI;
            }

            blocks.y *= nbatches;

            CUDA_LAUNCH((rotate_kernel<T, method>), blocks, threads,
                    out, in, t, nimages, nbatches, blocksXPerImage, blocksYPerImage);

            POST_LAUNCH_CHECK();
        }
예제 #17
0
   /// Sets up the operand stack state for the bytecode index \p index,
   /// typically called at the beginning of a basic block: the state saved in
   /// _stacks[index] is copied into _stack and _stackTemps when present,
   /// otherwise both are cleared. Also sets _block to blocks(index).
   ///   Returns the bytecode index provided.
   virtual int32_t setupBBStartContext(int32_t index)
      {
      if (_stacks[index] == NULL)
         {
         // No saved state for this index: start from empty stacks.
         if (_stack)
            _stack->clear();
         _stackTemps.clear();
         }
      else
         {
         *_stack = *_stacks[index];
         _stackTemps = *_stacks[index];
         }

      _block = blocks(index);
      return index;
      }
예제 #18
0
    // Launches bcast_first_kernel<To, op>.
    //
    // Blocks are threads_x wide and THREADS_PER_BLOCK / threads_x high; the
    // grid is (blocks_x * out.dims[2]) x (blocks_y * out.dims[3]). `lim` is
    // out.dims[0] divided by (threads_x * blocks_x), rounded up.
    static void bcast_first_launcher(Param<To> out,
                                     CParam<To> tmp,
                                     const uint blocks_x,
                                     const uint blocks_y,
                                     const uint threads_x)
    {
        uint lim = divup(out.dims[0], (threads_x * blocks_x));

        dim3 block(threads_x, THREADS_PER_BLOCK / threads_x);
        dim3 grid(blocks_x * out.dims[2],
                  blocks_y * out.dims[3]);

        CUDA_LAUNCH((bcast_first_kernel<To, op>), grid, block,
                    out, tmp, blocks_x, blocks_y, lim);

        POST_LAUNCH_CHECK();
    }
예제 #19
0
파일: libgolem.c 프로젝트: tkerber/golem
// uri_is_blocked is a thin wrapper around the block RPC call: it supplies the
// URI of the extension's web page and handles errors. When the call reports
// an error, the message is printed and the uri is treated as not blocked.
static gboolean
uri_is_blocked(const char *uri, guint64 flags, Exten *exten)
{
    GError *err = NULL;
    const char *page_uri = webkit_web_page_get_uri(exten->web_page);
    gboolean blocked = blocks(uri, page_uri, flags, exten, &err);

    if(err == NULL) {
        return blocked;
    }

    printf("Failed to check if uri is blocked: %s\n", err->message);
    g_error_free(err);
    return false;
}
예제 #20
0
// Launches histogramKernel<inType, outType, isLinear>.
//
// nElems is in.dims[0] * in.dims[1]; blk_x is nElems divided by
// THRD_LOAD * THREADS_X, rounded up. The grid is (blk_x * in.dims[2]) x
// in.dims[3].
void histogram(Param<outType> out, CParam<inType> in, int nbins, float minval, float maxval)
{
    int nElems = in.dims[0] * in.dims[1];
    int blk_x  = divup(nElems, THRD_LOAD*THREADS_X);

    // If nbins > MAX_BINS, we are using global memory so smem_size can be 0;
    int smem_size = 0;
    if (nbins <= MAX_BINS) {
        smem_size = nbins * sizeof(outType);
    }

    dim3 threads(kernel::THREADS_X, 1);
    dim3 blocks(blk_x * in.dims[2], in.dims[3]);

    CUDA_LAUNCH_SMEM((histogramKernel<inType, outType, isLinear>), blocks, threads, smem_size,
            out, in, nElems, nbins, minval, maxval, blk_x);

    POST_LAUNCH_CHECK();
}
예제 #21
0
	// Builds the alert text: the peer_alert message, followed by the names of
	// the flags set in picker_flags inside " picker_log [ ... ] ", followed by
	// one "(piece,block) " pair per entry returned by blocks().
	std::string picker_log_alert::message() const
	{
		// Name of bit i of picker_flags, for i in [0, num_flag_names).
		static char const* const flag_names[] =
		{
			"partial_ratio ",
			"prioritize_partials ",
			"rarest_first_partials ",
			"rarest_first ",
			"reverse_rarest_first ",
			"suggested_pieces ",
			"prio_sequential_pieces ",
			"sequential_pieces ",
			"reverse_pieces ",
			"time_critical ",
			"random_pieces ",
			"prefer_contiguous ",
			"reverse_sequential ",
			"backup1 ",
			"backup2 ",
			"end_game "
		};
		int const num_flag_names = int(sizeof(flag_names) / sizeof(flag_names[0]));

		std::string ret = peer_alert::message();

		boost::uint32_t flags = picker_flags;
		int idx = 0;
		ret += " picker_log [ ";
		// The flags value is 32 bits wide but only num_flag_names bits have a
		// name; stop at the end of the table so that a set bit without a name
		// cannot index past the array.
		for (; flags != 0 && idx < num_flag_names; flags >>= 1, ++idx)
		{
			if ((flags & 1) == 0) continue;
			ret += flag_names[idx];
		}
		ret += "] ";

		std::vector<piece_block> b = blocks();

		for (int i = 0; i < int(b.size()); ++i)
		{
			char buf[50];
			snprintf(buf, sizeof(buf), "(%d,%d) "
				, b[i].piece_index, b[i].block_index);
			ret += buf;
		}
		return ret;
	}
예제 #22
0
    // Launches bcast_dim_kernel<To, op, dim>.
    //
    // Blocks are THREADS_X wide and threads_y high; the grid is
    // (blocks_all[0] * blocks_all[2]) x (blocks_all[1] * blocks_all[3]).
    // `lim` is out.dims[dim] divided by (threads_y * blocks_all[dim]),
    // rounded up.
    static void bcast_dim_launcher(Param<To> out,
                                   CParam<To> tmp,
                                   const uint threads_y,
                                   const uint blocks_all[4])
    {
        uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

        dim3 block(THREADS_X, threads_y);
        dim3 grid(blocks_all[0] * blocks_all[2],
                  blocks_all[1] * blocks_all[3]);

        CUDA_LAUNCH((bcast_dim_kernel<To, op, dim>), grid, block,
            out, tmp, blocks_all[0], blocks_all[1], blocks_all[dim], lim);

        POST_LAUNCH_CHECK();
    }
예제 #23
0
// Checks that the filled cells of a Sudoku board do not repeat a digit within
// any row, any column, or any 3x3 block, using one seen-table per constraint.
// Empty cells are '.'.
//
// Only the top-left 9x9 region of `board` is inspected. Returns false when
// the board has fewer than 9 rows, a row has fewer than 9 cells, or a cell
// holds anything other than '.' or '1'..'9' (these inputs would otherwise
// index the tables out of range).
bool isValidSudoku2(std::vector<std::vector<char> > &board) {
    if (board.size() < 9)
        return false;
    for (int i = 0; i < 9; ++i) {
        if (board[i].size() < 9)
            return false;
    }

    std::vector<std::vector<bool> > rows(9, std::vector<bool>(9, false));
    std::vector<std::vector<bool> > cols(9, std::vector<bool>(9, false));
    std::vector<std::vector<bool> > blocks(9, std::vector<bool>(9, false));

    for (int i = 0; i < 9; ++i) {
        for (int j = 0; j < 9; ++j) {
            const char cell = board[i][j];
            if (cell == '.') continue;
            if (cell < '1' || cell > '9')
                return false;
            const int c = cell - '1';
            // i - i % 3 is the first row of the block band (0, 3 or 6);
            // adding j / 3 selects the block within that band.
            const int b = i - i % 3 + j / 3;
            if (rows[i][c] || cols[j][c] || blocks[b][c])
                return false;
            rows[i][c] = cols[j][c] = blocks[b][c] = true;
        }
    }
    return true;
}
예제 #24
0
    shared_ptr<ChunkModelResult> compute_chunk(const ChunkModelData &data,
                                               const BlockTypeInfo &block_data) {
        std::vector<BlockData> blocks(XZ_SIZE * XZ_SIZE * XZ_SIZE);
        std::vector<char> highest(XZ_SIZE * XZ_SIZE);

        BlockData *above = data.above->blocks;
        BlockData *below = data.below->blocks;
        BlockData *left = data.left->blocks;
        BlockData *right = data.right->blocks;
        BlockData *front = data.front->blocks;
        BlockData *back = data.back->blocks;
        BlockData *above_left = data.above_left->blocks;
        BlockData *above_right = data.above_right->blocks;
        BlockData *above_front = data.above_front->blocks;
        BlockData *above_back = data.above_back->blocks;
        BlockData *above_left_front = data.above_left_front->blocks;
        BlockData *above_right_front = data.above_right_front->blocks;
        BlockData *above_left_back = data.above_left_back->blocks;
        BlockData *above_right_back = data.above_right_back->blocks;
        BlockData *left_front = data.left_front->blocks;
        BlockData *right_front = data.right_front->blocks;
        BlockData *left_back = data.left_back->blocks;
        BlockData *right_back = data.right_back->blocks;

        const char *is_transparent = block_data.is_transparent;
        const char *is_plant = block_data.is_plant;
        const char *state = block_data.state;

        int ox = - CHUNK_SIZE - 1;
        int oy = - CHUNK_SIZE - 1;
        int oz = - CHUNK_SIZE - 1;

        /* Populate the blocks array with the chunk itself */
        const BlockData *self = data.self->blocks;

        CHUNK_FOR_EACH(self, ex, ey, ez, eb) {
            int x = ex - ox;
            int y = ey - oy;
            int z = ez - oz;
            blocks[XYZ(x, y, z)] = eb;
            if (!is_transparent[eb.type]) {
                highest[XZ(x, z)] = std::max((int)highest[XZ(x, z)], y);
            }
        } END_CHUNK_FOR_EACH;
예제 #25
0
  /*
   * Visits translation `tid`: recursively visits the destinations of its
   * outgoing arcs in m_cfg, then prepends tid's region to m_region and
   * records tid as selected.
   *
   * Returns without doing anything when adding tid's instructions to
   * m_numBCInstrs would exceed RuntimeOption::EvalJitMaxRegionInstrs, or
   * when tid is already in m_visited.
   */
  void visit(TransID tid) {
    auto tidRegion = m_profData->transRegion(tid);
    auto tidInstrs = tidRegion->instrSize();
    // Budget check against the instruction count accumulated so far.
    if (m_numBCInstrs + tidInstrs > RuntimeOption::EvalJitMaxRegionInstrs) {
      return;
    }

    if (m_visited.count(tid)) return;
    m_visited.insert(tid);
    // m_visiting holds tid from here until the erase at the end of this call,
    // i.e. for the duration of the recursive visits below.
    m_visiting.insert(tid);

    // Only follow outgoing arcs when tid's last instruction does not break
    // the region.
    if (!breaksRegion(*(m_profData->transLastInstr(tid)))) {

      // Arcs recorded below start at the last block of tid's region.
      auto srcBlockId = tidRegion->blocks().back().get()->id();

      for (auto const arc : m_cfg.outArcs(tid)) {
        auto dst = arc->dst();

        // If dst is in the visiting set then this arc forms a cycle. Don't
        // include it unless we've asked for loops.
        if (!RuntimeOption::EvalJitLoops && m_visiting.count(dst)) continue;

        // Skip dst if we already generated a region starting at that SrcKey.
        auto dstSK = m_profData->transSrcKey(dst);
        if (m_profData->optimized(dstSK)) continue;

        // The arc is recorded before dst is visited, whether or not the
        // recursive call below ends up adding dst's region.
        auto dstBlockId = m_profData->transRegion(dst)->entry()->id();
        m_arcs.push_back({srcBlockId, dstBlockId});

        visit(dst);
      }
    }

    // Now insert the region for tid in the front of m_region.  We do
    // this last so that the region ends up in (quasi-)topological order
    // (it'll be in topological order for acyclic regions).
    m_region->prepend(*tidRegion);
    m_selectedSet.insert(tid);
    if (m_selectedVec) m_selectedVec->push_back(tid);
    m_numBCInstrs += tidRegion->instrSize();
    // NOTE(review): the budget check at the top ran before the recursive
    // visits, which may themselves have increased m_numBCInstrs -- confirm
    // this assertion cannot fire in that case.
    always_assert(m_numBCInstrs <= RuntimeOption::EvalJitMaxRegionInstrs);

    m_visiting.erase(tid);
  }
예제 #26
0
// Releases blocks(moduleSize) units previously added to _locked, then wakes
// every waiter on _cWakeup. If more is released than is currently locked, a
// warning is printed, the assertion fires, and _locked is reset to 0.
void ResourcePool::unlockBlocks( DWORD moduleSize ) {
    const DWORD need = blocks( moduleSize );

    // _locked is only modified while _mFreeBuffers is held.
    _mFreeBuffers.lock();
    if (_locked < need) {
        printf( "[ResourcePool] Warning, unlock blocks error: id=%d, need=%d, $locked=%d\n",
            _id, need, _locked );
        assert(false);
        _locked = 0;
    }
    else {
        _locked -= need;
    }
    _mFreeBuffers.unlock();

    _cWakeup.notify_all();
}
예제 #27
0
void nonMaximal(float* x_out, float* y_out, float* resp_out,
                 unsigned* count, const unsigned idim0, const unsigned idim1,
                 const T * resp_in, const unsigned edge, const unsigned max_corners)
{
    dim3 threads(BLOCK_X, BLOCK_Y);
    dim3 blocks(divup(idim0-edge*2, BLOCK_X), divup(idim1-edge*2, BLOCK_Y));

    unsigned* d_corners_found = memAlloc<unsigned>(1);
    CUDA_CHECK(cudaMemsetAsync(d_corners_found, 0, sizeof(unsigned),
                cuda::getStream(cuda::getActiveDeviceId())));

    CUDA_LAUNCH((nonMaxKernel<T>), blocks, threads,
            x_out, y_out, resp_out, d_corners_found, idim0, idim1, resp_in, edge, max_corners);

    POST_LAUNCH_CHECK();

    CUDA_CHECK(cudaMemcpy(count, d_corners_found, sizeof(unsigned), cudaMemcpyDeviceToHost));
    memFree(d_corners_found);
}
예제 #28
0
    // Sends a DsRenewLeaseMessage to ns_ip_port_[who] and handles the reply.
    //
    // Returns EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR when no client could be
    // created, the result of send_msg_to_server when sending fails, the status
    // carried by a STATUS_MESSAGE reply, EXIT_UNKNOWN_MSGTYPE for any other
    // unexpected reply type, and TFS_SUCCESS after a renew response has been
    // handed to process_renew_response.
    int LeaseManager::renew(const int32_t timeout_ms, const int32_t who)
    {
      DsRuntimeGlobalInformation& ds_info = DsRuntimeGlobalInformation::instance();
      DsRenewLeaseMessage req_msg;
      req_msg.set_ds_stat(ds_info.information_);

      // For the master, fill the request with the blocks returned for
      // BLOCK_WRITABLE and record how many were written.
      if (is_master(who))
      {
        BlockInfoV2* block_infos = req_msg.get_block_infos();
        ArrayHelper<BlockInfoV2> blocks(MAX_WRITABLE_BLOCK_COUNT, block_infos);
        get_writable_block_manager().get_blocks(blocks, BLOCK_WRITABLE);
        req_msg.set_size(blocks.get_array_index());
      }

      tbnet::Packet* ret_msg = NULL;
      NewClient* new_client = NewClientManager::get_instance().create_client();
      if (NULL == new_client)
      {
        return EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR;
      }

      int ret = send_msg_to_server(ns_ip_port_[who], new_client, &req_msg, ret_msg, timeout_ms);
      if (TFS_SUCCESS == ret)
      {
        if (DS_RENEW_LEASE_RESPONSE_MESSAGE == ret_msg->getPCode())
        {
          DsRenewLeaseResponseMessage* resp_msg = dynamic_cast<DsRenewLeaseResponseMessage* >(ret_msg);
          process_renew_response(resp_msg, who);
        }
        else if (STATUS_MESSAGE == ret_msg->getPCode())
        {
          StatusMessage* resp_msg = dynamic_cast<StatusMessage*>(ret_msg);
          ret = resp_msg->get_status();
        }
        else
        {
          ret = EXIT_UNKNOWN_MSGTYPE;
        }
      }

      // The client is destroyed on every path that created one.
      NewClientManager::get_instance().destroy_client(new_client);
      return ret;
    }
예제 #29
0
        // Launches select_scalar_kernel<T, flip> with `b` converted to T via
        // scalar<T>(b).
        //
        // For ndims == 1 the block is flattened to (DIMX * DIMY) x 1.
        // blk_x / blk_y are the blocks needed to span out.dims[0] /
        // out.dims[1]; both are also passed to the kernel.
        void select_scalar(Param<T> out, CParam<char> cond, CParam<T> a, const double b, int ndims)
        {
            dim3 threads(DIMX, DIMY);

            if (ndims == 1) {
                threads.x *= threads.y;
                threads.y = 1;
            }

            int blk_x = divup(out.dims[0], threads.x);
            int blk_y = divup(out.dims[1], threads.y);

            // Scale the grid by the extents of the third and fourth
            // dimensions, not by the block size: multiplying by
            // threads.x / threads.y made the grid unrelated to out.dims[2]
            // and out.dims[3].
            // NOTE(review): assumes the kernel recovers the dims[2]/dims[3]
            // index from blockIdx together with blk_x/blk_y -- confirm
            // against select_scalar_kernel.
            dim3 blocks(blk_x * out.dims[2],
                        blk_y * out.dims[3]);

            CUDA_LAUNCH((select_scalar_kernel<T, flip>), blocks, threads,
                        out, cond, a, scalar<T>(b), blk_x, blk_y);

            // Report launch errors right after the launch.
            POST_LAUNCH_CHECK();
        }
예제 #30
0
    // Launches the transpose kernel for `in`, writing to `out`.
    //
    // Dimensions passed to this function should be input dimensions; any
    // necessary transformations and dimension related calculations are
    // carried out here and inside the kernel. The kernel's third template
    // argument is true only when both in.dims[0] and in.dims[1] are multiples
    // of TILE_DIM.
    void transpose(Param<T> out, CParam<T> in, const int ndims)
    {
        // Tiles of TILE_DIM needed to span in.dims[0] and in.dims[1].
        int blk_x = divup(in.dims[0],TILE_DIM);
        int blk_y = divup(in.dims[1],TILE_DIM);

        dim3 threads(kernel::THREADS_X,kernel::THREADS_Y);
        // launch batch * blk_x blocks along x dimension
        dim3 blocks(blk_x * in.dims[2], blk_y * in.dims[3]);

        const bool wholeTiles =
            (in.dims[0] % TILE_DIM == 0) && (in.dims[1] % TILE_DIM == 0);

        if (wholeTiles) {
            CUDA_LAUNCH((transpose<T, conjugate, true >), blocks, threads, out, in, blk_x, blk_y);
        } else {
            CUDA_LAUNCH((transpose<T, conjugate, false>), blocks, threads, out, in, blk_x, blk_y);
        }

        POST_LAUNCH_CHECK();
    }