Example #1
void InternalThread::StartInternalThread() {
  // TODO switch to failing once Caffe prefetch thread is persistent.
  // Threads should not be started and stopped repeatedly.
  // CHECK(!is_started());
  StopInternalThread();

#ifndef CPU_ONLY
  CUDA_CHECK(cudaGetDevice(&device_));
#endif
  mode_ = Caffe::mode();
  rand_seed_ = caffe_rng_rand();
  solver_count_ = Caffe::solver_count();
  root_solver_ = Caffe::root_solver();

  try {
    thread_.reset(new boost::thread(&InternalThread::entry, this));
  } catch (std::exception& e) {
    CHECK(false) << e.what();
  }
}
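Most of the Caffe snippets on this page wrap CUDA runtime calls in CUDA_CHECK without showing the macro itself. As a point of reference, a minimal sketch of a glog-based definition, assuming the CUDA runtime API and glog's CHECK_EQ, looks like this:

// Sketch only: the exact definition varies between projects.
#define CUDA_CHECK(condition)                                           \
  do {                                                                  \
    cudaError_t error = (condition);                                    \
    CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error);   \
  } while (0)

The do { ... } while (0) wrapper keeps the macro safe to use as a single statement inside if/else, and cudaGetErrorString() turns the error code into a readable message attached to the failed CHECK.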
Example #2
        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            const dim_type nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const dim_type ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                          cudaMemcpyDeviceToDevice));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            if (nimages > 1)     { blocks.x *= nimages;   }
            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if (inverse) {
                transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);
            } else {
                transform_kernel<T, false><<<blocks, threads>>>(out, in, nimages, ntransforms);
            }
            POST_LAUNCH_CHECK();
        }
Example #3
float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
#else
    NO_GPU;
#endif
  } else {
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  }
  return elapsed_milliseconds_;
}
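A typical caller times a stretch of GPU work and then reads the elapsed time, which is when the branch above runs. A small usage sketch, assuming Caffe's Timer interface with Start() and Stop():

caffe::Timer timer;
timer.Start();
// ... enqueue and run GPU work here ...
timer.Stop();
LOG(INFO) << "Elapsed: " << timer.MilliSeconds() << " ms";

Because MilliSeconds() calls Stop() itself when the timer is still running, the explicit Stop() above is optional.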
Example #4
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensor4dDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensor4dDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}
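The cuDNN calls here go through CUDNN_CHECK, the cuDNN counterpart of CUDA_CHECK. A sketch under the same assumptions (glog's CHECK_EQ plus a cudnnGetErrorString() helper that maps a cudnnStatus_t to text):

// Sketch only: assumes a cudnnGetErrorString() helper is available.
#define CUDNN_CHECK(condition)                                          \
  do {                                                                  \
    cudnnStatus_t status = (condition);                                 \
    CHECK_EQ(status, CUDNN_STATUS_SUCCESS)                              \
        << " " << cudnnGetErrorString(status);                          \
  } while (0)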
Example #5
Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");

    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Example #6
void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X != Y) {
    // If more than one OpenMP thread is active (we are inside a parallel
    // region), checking Caffe::mode() could create an additional GPU context.
    if (
#ifdef _OPENMP
        (omp_in_parallel() == 0) &&
#endif
        (Caffe::mode() == Caffe::GPU)) {
#ifndef CPU_ONLY
      // NOLINT_NEXT_LINE(caffe/alt_fn)
      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
      NO_GPU;
#endif
    } else {
      caffe_cpu_copy<Dtype>(N, X, Y);
    }
  }
}
Example #7
// Bring the data to the CPU
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
Example #8
/*
// Launch GPU kernel of normalize
//
// API
// int normalizeGPULaunch(const int alfa, CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
           CUstream stream);
// INPUT
// alfa
// dev_map_in
// dev_norm
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX, sizeY;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;

    void *normalize_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map,
            (void *) &dev_map_out->map, (void *) &sizeX, (void *) &sizeY,
            (void *) &alfa, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = NUM_SECTOR * 2;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}
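Unlike the runtime-API examples above, the latent-SVM snippets use the CUDA driver API: CUresult return values and a two-argument CUDA_CHECK(res, msg) that names the call that failed. A plausible sketch of such a checker, assuming cuGetErrorName() from the driver API and stderr logging (the project's real macro may also abort or return an error code):

// Sketch only: two-argument, driver-API flavor of CUDA_CHECK.
#define CUDA_CHECK(res, msg)                                            \
  do {                                                                  \
    if ((res) != CUDA_SUCCESS) {                                        \
      const char *errName = NULL;                                       \
      cuGetErrorName((res), &errName);                                  \
      fprintf(stderr, "%s failed: %s\n", (msg),                         \
              errName ? errName : "unknown CUDA error");                \
    }                                                                   \
  } while (0)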
Example #9
/*
// Launch GPU kernel of PCA feature maps
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
           CUstream stream);
// INPUT
// dev_map_in
// bx
// by
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX, sizeY, p;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    p = dev_map_in->numFeatures;

    void *pca_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_map_out->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &p, (void *) &bx, (void *) &by };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}
Example #10
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
#if 0
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
#endif
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
Example #11
File: memory.cpp Project: hxiaox/arrayfire
    T* pinnedAlloc(const size_t &elements)
    {
        managerInit();
        T* ptr = NULL;
        // Round the allocation up to the next megabyte. Creating pinned
        // memory is expensive, so we prefer larger, reusable blocks.
        size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
                pinnedGarbageCollect();
            }

            for(mem_iter iter = pinned_maps.begin();
                iter != pinned_maps.end(); ++iter) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    pinned_used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
                pinnedGarbageCollect();
                CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, false, alloc_bytes};
            pinned_maps[ptr] = info;
            pinned_used_bytes += alloc_bytes;
        }
        return (T*)ptr;
    }
Example #12
float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING)<< "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
            stop_gpu_));
    // CUDA events only measure milliseconds.
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  } else {
#endif
    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
#ifdef USE_CUDA
  }
#endif
  return elapsed_microseconds_;
}
Example #13
/*
// Launch GPU kernel of calculate norm
//
// API
//int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
          CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in
// stream
// OUTPUT
// dev_norm
// RESULT
// Error status
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX, sizeY, xp;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    xp = dev_map_in->numFeatures;

    void *calc_norm_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &xp, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(calculateNorm_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(calcuateNorm)");

    return LATENT_SVM_OK;
}
Example #14
void MPIComm::ThreadFunc(int device){
#ifndef CPU_ONLY
  //LOG(ERROR)<<"device_id is "<<device;
  CUDA_CHECK(cudaSetDevice(device));
#endif
  started_.store(true);
  MPIJob job;
  while (true){
    mutex::scoped_lock lock(queue_mutex_);
    while( task_queue_.empty() && IsRunning()){
      DLOG(INFO)<<"no job running, waiting on cond";
      cond_work_.wait(lock);
    }
    lock.unlock();

    DLOG(INFO)<<"Cond fulfilled, dispatching job";
    if (IsRunning()){
      job = task_queue_.front();
      DLOG(INFO)<<task_queue_.size();
      DispatchJob(job);
      mutex::scoped_lock pop_lock(queue_mutex_);
      task_queue_.pop();
      pop_lock.unlock();
      cond_finish_.notify_one();
      DLOG(INFO)<<"job finished, poped taskqueue";
    }else{
      break;
    }

  }

  // finish remaining jobs
  while (!task_queue_.empty()){
    boost::lock_guard<mutex> lock(queue_mutex_);
    job = task_queue_.front();
    task_queue_.pop();
    DispatchJob(job);
  }
}
Example #15
Array<T>  morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T),
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Example #16
    T* memAlloc(const size_t &elements)
    {
        int n = getActiveDeviceId();
        T* ptr = NULL;
        size_t alloc_bytes = divup(sizeof(T) * elements, 1024) * 1024;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes >= MAX_BYTES) {
                garbageCollect();
            }

            for(mem_iter iter = memory_maps[n].begin();
                iter != memory_maps[n].end(); iter++) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
                garbageCollect();
                CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, alloc_bytes};
            memory_maps[n][ptr] = info;
            used_bytes += alloc_bytes;
        }
        return ptr;
    }
Example #17
SocketBuffer* SocketBuffer::Read(bool data) {
  // Pop the message from local queue
  QueuedMessage* qm = NULL;
  if(data) {
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue.pop());
#ifndef CPU_ONLY
    // Copy the received buffer to GPU memory
    CUDA_CHECK(cudaMemcpy(this->addr(), qm->buffer,  // NOLINT(caffe/alt_fn)
               qm->size, cudaMemcpyHostToDevice));  // NOLINT(caffe/alt_fn)
#else
    //caffe_copy(qm->size, qm->buffer, this->addr_);
    memcpy(this->addr_, qm->buffer, qm->size);
#endif
  } else {
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue_ctrl.pop());
  }
  // Free up the buffer and the wrapper object
  if(data)
    delete qm->buffer;
  delete qm;
  return this;
}
Example #18
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
    int initial_device;
    CUDA_CHECK(cudaGetDevice(&initial_device));
    const int self = solver_->param().device_id();
    CUDA_CHECK(cudaSetDevice(self));

    if (parent_) {
        CUDA_CHECK(cudaFree(parent_grads_));
        const int peer = parent_->solver_->param().device_id();
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
        if (access) {
            CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
        }
    }

    CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
Example #19
void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
#ifndef CPU_ONLY
    vector<int> remaining(devices);

    // Depth for reduction tree
    int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));

    // Group GPUs by board
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                cudaDeviceProp a, b;
                CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
                CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
                if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
                    if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
                        pairs->push_back(DevicePair(remaining[i], remaining[j]));
                        DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
                        remaining.erase(remaining.begin() + j);
                        break;
                    }
                }
            }
        }
    }
    ostringstream s;
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();

    // Group by P2P accessibility
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                int access;
                CUDA_CHECK(
                    cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
                if (access) {
                    pairs->push_back(DevicePair(remaining[i], remaining[j]));
                    DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
                    remaining.erase(remaining.begin() + j);
                    break;
                }
            }
        }
    }
    s.str("");
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();

    // Group remaining
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
            DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
                       << remaining[i + 1];
            remaining.erase(remaining.begin() + i + 1);
        }
    }

    // Should only be the parent node remaining
    CHECK_EQ(remaining.size(), 1);

    pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));

    CHECK(pairs->size() == devices.size());
    for (int i = 0; i < pairs->size(); ++i) {
        CHECK((*pairs)[i].parent() != (*pairs)[i].device());
        for (int j = i + 1; j < pairs->size(); ++j) {
            CHECK((*pairs)[i].device() != (*pairs)[j].device());
        }
    }
#else
    NO_GPU;
#endif
}
Example #20
GPUParams<Dtype>::~GPUParams() {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaFree(data_));
    CUDA_CHECK(cudaFree(diff_));
#endif
}
Example #21
File: orb.hpp Project: pavanky/arrayfire
void orb(unsigned* out_feat,
         float** d_x,
         float** d_y,
         float** d_score,
         float** d_ori,
         float** d_size,
         unsigned** d_desc,
         std::vector<unsigned>& feat_pyr,
         std::vector<float*>& d_x_pyr,
         std::vector<float*>& d_y_pyr,
         std::vector<unsigned>& lvl_best,
         std::vector<float>& lvl_scl,
         std::vector<CParam<T> >& img_pyr,
         const float fast_thr,
         const unsigned max_feat,
         const float scl_fctr,
         const unsigned levels)
{
    unsigned patch_size = REF_PAT_SIZE;

    unsigned max_levels = feat_pyr.size();

    // In future implementations, the user will be able to pass their own
    // distribution instead of using the reference one.
    //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));

    std::vector<float*> d_score_pyr(max_levels);
    std::vector<float*> d_ori_pyr(max_levels);
    std::vector<float*> d_size_pyr(max_levels);
    std::vector<unsigned*> d_desc_pyr(max_levels);
    std::vector<unsigned*> d_idx_pyr(max_levels);

    unsigned total_feat = 0;

    // Calculate a separable Gaussian kernel
    unsigned gauss_len = 9;
    convAccT* h_gauss = new convAccT[gauss_len];
    gaussian1D(h_gauss, gauss_len, 2.f);
    Param<convAccT> gauss_filter;
    gauss_filter.dims[0] = gauss_len;
    gauss_filter.strides[0] = 1;

    for (int k = 1; k < 4; k++) {
        gauss_filter.dims[k] = 1;
        gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1];
    }

    dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
    gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));

    delete[] h_gauss;

    for (int i = 0; i < (int)max_levels; i++) {
        if (feat_pyr[i] == 0 || lvl_best[i] == 0) {
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        unsigned* d_usable_feat = memAlloc<unsigned>(1);
        CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned)));

        float* d_x_harris = memAlloc<float>(feat_pyr[i]);
        float* d_y_harris = memAlloc<float>(feat_pyr[i]);
        float* d_score_harris = memAlloc<float>(feat_pyr[i]);

        // Calculate Harris responses
        // Good block_size >= 7 (must be an odd number)
        dim3 threads(THREADS_X, THREADS_Y);
        dim3 blocks(divup(feat_pyr[i], threads.x), 1);
        harris_response<T,false><<<blocks, threads>>>(d_x_harris, d_y_harris, d_score_harris, NULL,
                                                      d_x_pyr[i], d_y_pyr[i], NULL,
                                                      feat_pyr[i], d_usable_feat,
                                                      img_pyr[i], 7, 0.04f, patch_size);
        POST_LAUNCH_CHECK();

        unsigned usable_feat = 0;
        CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned), cudaMemcpyDeviceToHost));

        memFree(d_x_pyr[i]);
        memFree(d_y_pyr[i]);
        memFree(d_usable_feat);

        feat_pyr[i] = usable_feat;

        if (feat_pyr[i] == 0) {
            memFree(d_x_harris);
            memFree(d_y_harris);
            memFree(d_score_harris);
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        Param<float> harris_sorted;
        Param<unsigned> harris_idx;

        harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i];
        harris_sorted.strides[0] = harris_idx.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            harris_sorted.dims[k] = 1;
            harris_sorted.strides[k] = harris_sorted.dims[k - 1] * harris_sorted.strides[k - 1];
            harris_idx.dims[k] = 1;
            harris_idx.strides[k] = harris_idx.dims[k - 1] * harris_idx.strides[k - 1];
        }

        dim_type sort_elem = harris_sorted.strides[3] * harris_sorted.dims[3];
        harris_sorted.ptr = d_score_harris;
        harris_idx.ptr = memAlloc<unsigned>(sort_elem);

        // Sort features according to Harris responses
        sort0_index<float, false>(harris_sorted, harris_idx);

        feat_pyr[i] = std::min(feat_pyr[i], lvl_best[i]);

        float* d_x_lvl = memAlloc<float>(feat_pyr[i]);
        float* d_y_lvl = memAlloc<float>(feat_pyr[i]);
        float* d_score_lvl = memAlloc<float>(feat_pyr[i]);

        // Keep only features with higher Harris responses
        threads = dim3(THREADS, 1);
        blocks = dim3(divup(feat_pyr[i], threads.x), 1);
        keep_features<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_score_lvl, NULL,
                                              d_x_harris, d_y_harris, harris_sorted.ptr, harris_idx.ptr,
                                              NULL, feat_pyr[i]);
        POST_LAUNCH_CHECK();

        memFree(d_x_harris);
        memFree(d_y_harris);
        memFree(harris_sorted.ptr);
        memFree(harris_idx.ptr);

        float* d_ori_lvl = memAlloc<float>(feat_pyr[i]);

        // Compute orientation of features
        threads = dim3(THREADS_X, THREADS_Y);
        blocks  = dim3(divup(feat_pyr[i], threads.x), 1);
        centroid_angle<T><<<blocks, threads>>>(d_x_lvl, d_y_lvl, d_ori_lvl, feat_pyr[i],
                                               img_pyr[i], patch_size);
        POST_LAUNCH_CHECK();

        Param<T> lvl_tmp;
        Param<T> lvl_filt;

        for (int k = 0; k < 4; k++) {
            lvl_tmp.dims[k] = img_pyr[i].dims[k];
            lvl_tmp.strides[k] = img_pyr[i].strides[k];
            lvl_filt.dims[k] = img_pyr[i].dims[k];
            lvl_filt.strides[k] = img_pyr[i].strides[k];
        }

        dim_type lvl_elem = img_pyr[i].strides[3] * img_pyr[i].dims[3];
        lvl_tmp.ptr = memAlloc<T>(lvl_elem);
        lvl_filt.ptr = memAlloc<T>(lvl_elem);

        // Separable Gaussian filtering to reduce noise sensitivity
        convolve2<T, convAccT, 0, false>(lvl_tmp, img_pyr[i], gauss_filter);
        convolve2<T, convAccT, 1, false>(lvl_filt, CParam<T>(lvl_tmp), gauss_filter);

        memFree(lvl_tmp.ptr);
        if (i > 0) {
            memFree((T*)img_pyr[i].ptr);
        }

        img_pyr[i].ptr = lvl_filt.ptr;
        for (int k = 0; k < 4; k++) {
            img_pyr[i].dims[k] = lvl_filt.dims[k];
            img_pyr[i].strides[k] = lvl_filt.strides[k];
        }

        float* d_size_lvl = memAlloc<float>(feat_pyr[i]);

        unsigned* d_desc_lvl = memAlloc<unsigned>(feat_pyr[i] * 8);
        CUDA_CHECK(cudaMemset(d_desc_lvl, 0, feat_pyr[i] * 8 * sizeof(unsigned)));

        // Compute ORB descriptors
        threads = dim3(THREADS_X, THREADS_Y);
        blocks  = dim3(divup(feat_pyr[i], threads.x), 1);
        extract_orb<T><<<blocks, threads>>>(d_desc_lvl, feat_pyr[i],
                                            d_x_lvl, d_y_lvl, d_ori_lvl, d_size_lvl,
                                            img_pyr[i], lvl_scl[i], patch_size);
        POST_LAUNCH_CHECK();

        memFree((T*)img_pyr[i].ptr);

        // Store results to pyramids
        total_feat += feat_pyr[i];
        d_x_pyr[i] = d_x_lvl;
        d_y_pyr[i] = d_y_lvl;
        d_score_pyr[i] = d_score_lvl;
        d_ori_pyr[i] = d_ori_lvl;
        d_size_pyr[i] = d_size_lvl;
        d_desc_pyr[i] = d_desc_lvl;
    }

    memFree((T*)gauss_filter.ptr);

    // If no features are found, set found features to 0 and return
    if (total_feat == 0) {
        *out_feat = 0;
        return;
    }

    // Allocate output memory
    *d_x     = memAlloc<float>(total_feat);
    *d_y     = memAlloc<float>(total_feat);
    *d_score = memAlloc<float>(total_feat);
    *d_ori   = memAlloc<float>(total_feat);
    *d_size  = memAlloc<float>(total_feat);
    *d_desc  = memAlloc<unsigned>(total_feat * 8);
    unsigned offset = 0;
    for (unsigned i = 0; i < max_levels; i++) {
        if (feat_pyr[i] == 0)
            continue;

        if (i > 0)
            offset += feat_pyr[i-1];

        CUDA_CHECK(cudaMemcpy(*d_x+offset, d_x_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_y+offset, d_y_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_score+offset, d_score_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_ori+offset, d_ori_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));
        CUDA_CHECK(cudaMemcpy(*d_size+offset, d_size_pyr[i], feat_pyr[i] * sizeof(float), cudaMemcpyDeviceToDevice));

        CUDA_CHECK(cudaMemcpy(*d_desc+(offset*8), d_desc_pyr[i], feat_pyr[i] * 8 * sizeof(unsigned), cudaMemcpyDeviceToDevice));

        memFree(d_x_pyr[i]);
        memFree(d_y_pyr[i]);
        memFree(d_score_pyr[i]);
        memFree(d_ori_pyr[i]);
        memFree(d_size_pyr[i]);
        memFree(d_desc_pyr[i]);
    }

    // Sets number of output features
    *out_feat = total_feat;
}
Example #22
/*
// Getting feature map for the selected subimage in GPU
//
// API
//int getFeatureMapsGPUStream(const int numStep, const int k,
          CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
          CUstream *streams)
// INPUT
// numStep
// k
// devs_img
// streams
// OUTPUT
// devs_map
// RESULT
// Error status
*/
int getFeatureMapsGPUStream(const int numStep, const int k,
        CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map,
        CUstream *streams)
{
    int sizeX, sizeY;
    int p, px;
    int height, width;
    int i, j;

    int *nearest;
    float *w, a_x, b_x;

    int size_r, size_alfa, size_nearest, size_w, size_map;

    CUresult res;
    CvLSVMFeatureMapGPU **devs_r, **devs_alfa;
    CUdeviceptr dev_nearest, dev_w;

    px = 3 * NUM_SECTOR;
    p = px;

    size_nearest = k;
    size_w = k * 2;

    devs_r = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);
    devs_alfa = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * numStep);
    nearest = (int *) malloc(sizeof(int) * size_nearest);
    w = (float *) malloc(sizeof(float) * size_w);

    // initialize "nearest" and "w"
    for (i = 0; i < k / 2; i++)
    {
        nearest[i] = -1;
    }/*for(i = 0; i < k / 2; i++)*/
    for (i = k / 2; i < k; i++)
    {
        nearest[i] = 1;
    }/*for(i = k / 2; i < k; i++)*/

    for (j = 0; j < k / 2; j++)
    {
        b_x = k / 2 + j + 0.5f;
        a_x = k / 2 - j - 0.5f;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = 0; j < k / 2; j++)*/
    for (j = k / 2; j < k; j++)
    {
        a_x = j - k / 2 + 0.5f;
        b_x = -j + k / 2 - 0.5f + k;
        w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x));
        w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x));
    }/*for(j = k / 2; j < k; j++)*/

    res = cuMemAlloc(&dev_nearest, sizeof(int) * size_nearest);
    CUDA_CHECK(res, "cuMemAlloc(dev_nearest)");
    res = cuMemAlloc(&dev_w, sizeof(float) * size_w);
    CUDA_CHECK(res, "cuMemAlloc(dev_w)");

    res = cuMemcpyHtoDAsync(dev_nearest, nearest, sizeof(int) * size_nearest,
            streams[numStep - 1]);
    res = cuMemcpyHtoDAsync(dev_w, w, sizeof(float) * size_w,
            streams[numStep - 1]);

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;

        allocFeatureMapObjectGPU<float>(&devs_r[i], width, height, 1);
        allocFeatureMapObjectGPU<int>(&devs_alfa[i], width, height, 2);
    }

    // execute asynchronously
    for (i = 0; i < numStep; i++)
    {
        // initialize "map", "r" and "alfa"
        width = devs_img[i]->sizeX;
        height = devs_img[i]->sizeY;
        sizeX = width / k;
        sizeY = height / k;
        size_map = sizeX * sizeY * p;
        size_r = width * height;
        size_alfa = width * height * 2;

        // initialize device memory to 0
        res = cuMemsetD32Async(devs_map[i]->map, 0, size_map, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map)");
        res = cuMemsetD32Async(devs_r[i]->map, 0, size_r, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_r)");
        res = cuMemsetD32Async(devs_alfa[i]->map, 0, size_alfa, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_alfa)");

        // launch kernel
        calculateHistogramGPULaunch(k, devs_img[i], devs_r[i], devs_alfa[i],
                streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        getFeatureMapsGPULaunch(k, devs_r[i], devs_alfa[i], &dev_nearest,
                &dev_w, devs_map[i], streams[i]);
    }

    // free device memory
    res = cuMemFree(dev_nearest);
    CUDA_CHECK(res, "cuMemFree(dev_nearest)");
    res = cuMemFree(dev_w);
    CUDA_CHECK(res, "cuMemFree(dev_w)");

    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_r[i]);
        freeFeatureMapObjectGPU(&devs_alfa[i]);
    }

    free(nearest);
    free(w);
    free(devs_r);
    free(devs_alfa);

    return LATENT_SVM_OK;
}
Example #23
 CudaEvent::CudaEvent( ) : isRecorded( false ), finished( true ), refCounter( 0u )
 {
     log( ggLog::CUDA_RT()+ggLog::MEMORY(), "create event" );
     CUDA_CHECK( cudaEventCreateWithFlags( &event, cudaEventDisableTiming ) );
 }
Example #24
/*
// Feature map reduction in GPU
// In each cell we reduce dimension of the feature vector
// according to original paper special procedure
//
// API
//int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
          CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
          CUstream *streams)
// INPUT
// numStep
// bx
// by
// devs_map_in
// streams
// OUTPUT
// feature_maps
// RESULT
// Error status
*/
int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
        CUstream *streams)
{

    int sizeX, sizeY, pp;
    int size_map_pca;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_map_pca;

    pp = NUM_SECTOR * 3 + 4;

    devs_map_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate memory
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX + 2 * bx;
        sizeY = devs_map_in[i]->sizeY + 2 * by;

        size_map_pca = sizeX * sizeY * pp;

        allocFeatureMapObject(&feature_maps[i], sizeX, sizeY, pp);
        allocFeatureMapObjectGPU<float>(&devs_map_pca[i], sizeX, sizeY, pp);
    }

    // execute asynchronously
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // initialize device memory to 0
        res = cuMemsetD32Async(devs_map_pca[i]->map, 0, size_map_pca,
                streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_pca)");

        // launch kernel
        PCAFeatureMapsAddNullableBorderGPULaunch(devs_map_in[i],
                devs_map_pca[i], bx, by, streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // copy memory from device to host
        res = cuMemcpyDtoHAsync(feature_maps[i]->map, devs_map_pca[i]->map,
                sizeof(float) * size_map_pca, streams[i]);
        CUDA_CHECK(res, "cuMemcpyDtoH(dev_map_pca)");
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_map_pca[i]);
    }

    free(devs_map_pca);

    return LATENT_SVM_OK;
}
Example #25
/*
// Feature map Normalization and Truncation in GPU
//
// API
//int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
          CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
          CUstream *streams)
// INPUT
// numStep
// alfa
// devs_map_in
// streams
// OUTPUT
// devs_map_out
// RESULT
// Error status
*/
int normalizeAndTruncateGPUStream(const int numStep, const float alfa,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMapGPU **devs_map_out,
        CUstream *streams)
{

    int sizeX, sizeY, newSizeX, newSizeY, pp;
    int size_norm, size_map_out;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_norm;

    pp = NUM_SECTOR * 12;

    devs_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;

        allocFeatureMapObjectGPU<float>(&devs_norm[i], sizeX, sizeY, 1);
    }

    // execute asynchronously
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX;
        sizeY = devs_map_in[i]->sizeY;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;
        size_norm = sizeX * sizeY;
        size_map_out = newSizeX * newSizeY * pp;

        // initialize device memory to 0
        res = cuMemsetD32Async(devs_norm[i]->map, 0, size_norm, streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_norm)");
        res = cuMemsetD32Async(devs_map_out[i]->map, 0, size_map_out,
                streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_out)");

        // launch kernel
        calculateNormGPULaunch(devs_map_in[i], devs_norm[i], streams[i]);

    }

    for (i = 0; i < numStep; i++)
    {
        // launch kernel
        normalizeGPULaunch(alfa, devs_map_in[i], devs_norm[i], devs_map_out[i],
                streams[i]);
    }

    // synchronize cuda stream
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_norm[i]);
    }

    free(devs_norm);

    return LATENT_SVM_OK;
}
Example #26
void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {

  init_cpu = false;
  init_gpu = false;
  const caffe::MultiStageMeanfieldParameter meanfield_param = this->layer_param_.multi_stage_meanfield_param();

  num_iterations_ = meanfield_param.num_iterations();

  CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1.";

  theta_alpha_ = meanfield_param.theta_alpha();
  theta_beta_ = meanfield_param.theta_beta();
  theta_gamma_ = meanfield_param.theta_gamma();

  count_ = bottom[0]->count();
  num_ = bottom[0]->num();
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  num_pixels_ = height_ * width_;

  LOG(INFO) << "This implementation has not been tested batch size > 1.";

  top[0]->Reshape(num_, channels_, height_, width_);

  // Initialize the parameters that will be updated by backpropagation.
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Multimeanfield layer skipping parameter initialization.";
  } else {

    this->blobs_.resize(3);// blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel weights, blobs_[2] - compatability matrix

    // Allocate space for kernel weights.
    this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_));

    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data());
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data());

    // Initialize the kernel weights. The two files spatial.par and bilateral.par must be available.
    FILE * pFile;
    pFile = fopen("spatial.par", "r");
    CHECK(pFile) << "The file 'spatial.par' is not found. Please create it with initial spatial kernel weights.";
    for (int i = 0; i < channels_; i++) {
      fscanf(pFile, "%lf", &this->blobs_[0]->mutable_cpu_data()[i * channels_ + i]);
    }
    fclose(pFile);

    pFile = fopen("bilateral.par", "r");
    CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights.";
    for (int i = 0; i < channels_; i++) {
      fscanf(pFile, "%lf", &this->blobs_[1]->mutable_cpu_data()[i * channels_ + i]);
    }
    fclose(pFile);

    // Initialize the compatibility matrix.
    this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_));
    caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data());

    // Initialize it to have the Potts model.
    for (int c = 0; c < channels_; ++c) {
      (this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.);
    }
  }

  float spatial_kernel[2 * num_pixels_];
  float *spatial_kernel_gpu_;
  compute_spatial_kernel(spatial_kernel);
  spatial_lattice_.reset(new ModifiedPermutohedral());
  spatial_norm_.Reshape(1, 1, height_, width_);
  Dtype* norm_data_gpu ;
  Dtype*  norm_data;
  // Initialize the spatial lattice. This does not need to be computed for every image because we use a fixed size.
  switch (Caffe::mode()) {
    case Caffe::CPU:
      norm_data = spatial_norm_.mutable_cpu_data();
      spatial_lattice_->init(spatial_kernel, 2, width_, height_);
      // Calculate spatial filter normalization factors.
      norm_feed_= new Dtype[num_pixels_];
      caffe_set(num_pixels_, Dtype(1.0), norm_feed_);
      // pass norm_feed and norm_data to gpu
      spatial_lattice_->compute(norm_data, norm_feed_, 1);
      bilateral_kernel_buffer_ = new float[5 * num_pixels_];
      init_cpu = true;
      break;
    #ifndef CPU_ONLY
    case Caffe::GPU:
      CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2*num_pixels_ * sizeof(float))) ;
      CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel, 2*num_pixels_ * sizeof(float), cudaMemcpyHostToDevice)) ;
      spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_);
      CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype))) ;
      caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_);
      norm_data_gpu = spatial_norm_.mutable_gpu_data();
      spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1); 
      norm_data = spatial_norm_.mutable_cpu_data();
      CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float))) ;
      CUDA_CHECK(cudaFree(spatial_kernel_gpu_));
      init_gpu = true;
      break;
    #endif
    default:
    LOG(FATAL) << "Unknown caffe mode.";
  }
  
  for (int i = 0; i < num_pixels_; ++i) {
    norm_data[i] = 1.0f / (norm_data[i] + 1e-20f);
  }
  bilateral_norms_.Reshape(num_, 1, height_, width_);  

  // Configure the split layer that is used to make copies of the unary term. One copy for each iteration.
  // It may be possible to optimize this calculation later.
  split_layer_bottom_vec_.clear();
  split_layer_bottom_vec_.push_back(bottom[0]);

  split_layer_top_vec_.clear();

  split_layer_out_blobs_.resize(num_iterations_);
  for (int i = 0; i < num_iterations_; i++) {
    split_layer_out_blobs_[i].reset(new Blob<Dtype>());
    split_layer_top_vec_.push_back(split_layer_out_blobs_[i].get());
  }

  LayerParameter split_layer_param;
  split_layer_.reset(new SplitLayer<Dtype>(split_layer_param));
  split_layer_->SetUp(split_layer_bottom_vec_, split_layer_top_vec_);

  // Make blobs to store outputs of each meanfield iteration. Output of the last iteration is stored in top[0].
  // So we need only (num_iterations_ - 1) blobs.
  iteration_output_blobs_.resize(num_iterations_ - 1);
  for (int i = 0; i < num_iterations_ - 1; ++i) {
    iteration_output_blobs_[i].reset(new Blob<Dtype>(num_, channels_, height_, width_));
  }
  // Make instances of MeanfieldIteration and initialize them.
  meanfield_iterations_.resize(num_iterations_);
  for (int i = 0; i < num_iterations_; ++i) {
    meanfield_iterations_[i].reset(new MeanfieldIteration<Dtype>());
    meanfield_iterations_[i]->OneTimeSetUp(
        split_layer_out_blobs_[i].get(), // unary terms
        (i == 0) ? bottom[1] : iteration_output_blobs_[i - 1].get(), // softmax input
        (i == num_iterations_ - 1) ? top[0] : iteration_output_blobs_[i].get(), // output blob
        spatial_lattice_, // spatial lattice
        &spatial_norm_); // spatial normalization factors.
  }
  this->param_propagate_down_.resize(this->blobs_.size(), true);
  LOG(INFO) << ("MultiStageMeanfieldLayer initialized.");
}
Example #27
/*
// Property Message
//
// API
//static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
          int numStep, int startIndex, int sideLength, int bx, int by,
          CvLSVMFeaturePyramid **maps)
// INPUT
// image
// step
// numStep
// startIndex
// sideLength
// bx
// by
// OUTPUT
// maps
// RESULT
// Error status
*/
static int getPathOfFeaturePyramidGPUStream(IplImage * image, float step,
        int numStep, int startIndex, int sideLength, int bx, int by,
        CvLSVMFeaturePyramid **maps)
{
    CvLSVMFeatureMap **feature_maps;

    int i;
    int width, height, numChannels, sizeX, sizeY, p, pp, newSizeX, newSizeY;
    float *scales;
    CvLSVMFeatureMapGPU **devs_img, **devs_map_pre_norm, **devs_map_pre_pca;
    CUstream *streams;
    CUresult res;

    scales = (float *) malloc(sizeof(float) * (numStep));
    devs_img = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_norm = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    devs_map_pre_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));
    streams = (CUstream *) malloc(sizeof(CUstream) * (numStep));
    feature_maps = (CvLSVMFeatureMap **) malloc(
            sizeof(CvLSVMFeatureMap *) * (numStep));

    // allocate device memory
    for (i = 0; i < numStep; i++)
    {
        scales[i] = 1.0f / powf(step, (float) i);
        width  = (int) (((float) image->width ) * scales[i] + 0.5);
        height = (int) (((float) image->height) * scales[i] + 0.5);
        numChannels = image->nChannels;
        sizeX = width  / sideLength;
        sizeY = height / sideLength;
        p  = NUM_SECTOR * 3;
        pp = NUM_SECTOR * 12;
        newSizeX = sizeX - 2;
        newSizeY = sizeY - 2;

        allocFeatureMapObjectGPU<float>(&devs_img[i], width, height,
                numChannels);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_norm[i], sizeX, sizeY, p);
        allocFeatureMapObjectGPU<float>(&devs_map_pre_pca[i], newSizeX,
                newSizeY, pp);
        res = cuStreamCreate(&streams[i], CU_STREAM_DEFAULT);
        CUDA_CHECK(res, "cuStreamCreate(stream)");
    }

    // execute the main processing steps
    resizeGPUStream(numStep, image, scales, devs_img, streams);

    getFeatureMapsGPUStream(numStep, sideLength, devs_img, devs_map_pre_norm,
            streams);

    normalizeAndTruncateGPUStream(numStep, Val_Of_Truncate, devs_map_pre_norm,
            devs_map_pre_pca, streams);

    PCAFeatureMapsGPUStream(numStep, bx, by, devs_map_pre_pca, feature_maps,
            streams);

    // synchronize cuda stream
    for (i = 0; i < numStep; i++)
    {
        cuStreamSynchronize(streams[i]);
        cuStreamDestroy(streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        (*maps)->pyramid[startIndex + i] = feature_maps[i];
    }/*for(i = 0; i < numStep; i++)*/

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_img[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_norm[i]);
        freeFeatureMapObjectGPU(&devs_map_pre_pca[i]);
    }

    free(scales);
    free(devs_img);
    free(devs_map_pre_norm);
    free(devs_map_pre_pca);
    free(streams);
    free(feature_maps);

    return LATENT_SVM_OK;
}
Example #28
File: pocl-cuda.c Project: jrprice/pocl
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);

  CUresult result;
  void *b = NULL;

  /* if memory for this global memory is not yet allocated -> do it */
  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
    {
      cl_mem_flags flags = mem_obj->flags;

      if (flags & CL_MEM_USE_HOST_PTR)
        {
#if defined __arm__
          // cuMemHostRegister is not supported on ARM
          // Allocate device memory and perform explicit copies
          // before and after running a kernel
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          CUDA_CHECK (result, "cuMemAlloc");
#else
          result = cuMemHostRegister (host_ptr, mem_obj->size,
                                      CU_MEMHOSTREGISTER_DEVICEMAP);
          if (result != CUDA_SUCCESS
              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
            CUDA_CHECK (result, "cuMemHostRegister");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
#endif
        }
      else if (flags & CL_MEM_ALLOC_HOST_PTR)
        {
          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
                                   CU_MEMHOSTREGISTER_DEVICEMAP);
          CUDA_CHECK (result, "cuMemHostAlloc");
          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
                                              mem_obj->mem_host_ptr, 0);
          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
        }
      else
        {
          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
          if (result != CUDA_SUCCESS)
            {
              const char *err;
              cuGetErrorName (result, &err);
              POCL_MSG_PRINT2 (__FUNCTION__, __LINE__,
                               "-> Failed to allocate memory: %s\n", err);
              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
            }
        }

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");
        }

      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
          = device->global_mem_id;
    }

  /* copy already allocated global mem info to devices own slot */
  mem_obj->device_ptrs[device->dev_id]
      = mem_obj->device_ptrs[device->global_mem_id];

  return CL_SUCCESS;
}
Example #29
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
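In this example CUDA_CHECK evidently assigns the result into the local status variable and jumps to the Error label on failure, so that every driver-API call shares one cleanup path. A minimal sketch of such a goto-based checker, assuming the enclosing function declares CUresult status and an Error: label:

// Sketch only: assumes a local 'status' and an 'Error:' cleanup label.
#define CUDA_CHECK(call)                                                \
  do {                                                                  \
    status = (call);                                                    \
    if (status != CUDA_SUCCESS)                                         \
      goto Error;                                                       \
  } while (0)

With this shape, the Error block at the end of TestSAXPY() frees the host and device buffers exactly once, whether the function succeeds or bails out early.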
Example #30
void Caffe::SetDevice(const int device_id) {
  root_device_ = device_id;
  CUDA_CHECK(cudaSetDevice(root_device_));
}