Example #1
/*
 * Based on the implementation of THTensor_(indexCopy) in torch7
 */
static void THCudaTensor_indexCopy(THCudaTensor *tensor, int dim, THLongTensor *index, THCudaTensor *src)
{
  long i, numel;
  THCudaTensor *tSlice, *sSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < src->nDimension, 4, "Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);

  for (i=0; i<numel; i++)
  {
    if (tensor->nDimension > 1)
    {
      tSlice = THCudaTensor_new();
      sSlice = THCudaTensor_new();
      THCudaTensor_select(tSlice, tensor, dim, index_data[i]-1);
      THCudaTensor_select(sSlice, src, dim, i);
      THCudaTensor_copy(tSlice, sSlice);
      THCudaTensor_free(tSlice);
      THCudaTensor_free(sSlice);
    }
    else
    {
      // Copying a float from one device address to another device address is
      // faster than fetching it back to host memory and re-copying it to the device.
      THCudaCheck(cudaMemcpy(tensor->storage->data + tensor->storageOffset + index_data[i]-1,
        src->storage->data + src->storageOffset + i, sizeof(float), cudaMemcpyDeviceToDevice));
    }
  }
  THLongTensor_free(index);
}
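For reference, a minimal call sketch (not from the original source): it assumes the same state-less THC API as above, a caller in the same translation unit (the function is declared static), and pre-allocated 2-D CUDA tensors, dst with at least three rows and src with at least two; the helper name indexCopy_example and the chosen indices are illustrative only.

// Hypothetical sketch: scatter the rows of src into dst along dimension 0.
// Indices are 1-based, matching the index_data[i]-1 adjustment above.
static void indexCopy_example(THCudaTensor *dst, THCudaTensor *src)
{
  THLongTensor *idx = THLongTensor_newWithSize1d(2);
  THLongTensor_set1d(idx, 0, 3);  /* first row of src goes to third row of dst */
  THLongTensor_set1d(idx, 1, 1);  /* second row of src goes to first row of dst */
  THCudaTensor_indexCopy(dst, 0, idx, src);
  THLongTensor_free(idx);
}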
Example #2
void THCudaTensor_freeCopyTo(THCState *state, THCudaTensor *self, THCudaTensor *dst)
{
  if(self != dst)
    THCudaTensor_copy(state, dst, self);

  THCudaTensor_free(state, self);
}
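A possible usage sketch (not part of the original source), following the with-state cutorch API used in this example; the scratch computation and the helper name freeCopyTo_example are illustrative only.

// Hypothetical sketch: compute into a scratch tensor, then hand the result to a
// caller-provided destination and release the scratch in a single call.
static void freeCopyTo_example(THCState *state, THCudaTensor *dst)
{
  THCudaTensor *tmp = THCudaTensor_newWithSize1d(state, THCudaTensor_nElement(state, dst));
  THCudaTensor_fill(state, tmp, 1.0f);       // stand-in for a real computation
  THCudaTensor_freeCopyTo(state, tmp, dst);  // copies tmp into dst, then frees tmp
}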
Example #3
/*
 * Based on the implementation of THTensor_(indexFill) in torch7
 */
static void THCudaTensor_indexFill(THCudaTensor *tensor, int dim, THLongTensor *index, float val)
{
  long i, numel;
  THCudaTensor *tSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < tensor->nDimension, 4, "Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);
  
  for (i=0; i<numel; i++)
  {
    if (tensor->nDimension > 1)
    {
      // create a new CudaTensor
      tSlice = THCudaTensor_new();
      // set its storage to point to the corresponding storage in tensor
      THCudaTensor_select(tSlice, tensor, dim, index_data[i]-1);
      THCudaTensor_fill(tSlice, val);
      THCudaTensor_free(tSlice);
    }
    else
    {
      THCudaTensor_set1d(tensor, index_data[i]-1, val);
    }
  }
  THLongTensor_free(index);
}
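As with Example #1, a minimal call sketch (illustrative only): it assumes the same state-less API, a 2-D CUDA tensor with at least four rows, and a caller in the same translation unit since the function above is static.

// Hypothetical sketch: fill the first and fourth rows (1-based) of a 2-D CUDA
// tensor with -1 along dimension 0.
static void indexFill_example(THCudaTensor *tensor)
{
  THLongTensor *idx = THLongTensor_newWithSize1d(2);
  THLongTensor_set1d(idx, 0, 1);
  THLongTensor_set1d(idx, 1, 4);
  THCudaTensor_indexFill(tensor, 0, idx, -1.0f);
  THLongTensor_free(idx);
}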
Example #4
void THFloatTensor_copyCuda(THFloatTensor *self, struct THCudaTensor *src)
{
  THArgCheck(THFloatTensor_nElement(self) == THCudaTensor_nElement(src), 2, "sizes do not match"); 

  {
    THFloatTensor *selfc = THFloatTensor_newContiguous(self);
    src = THCudaTensor_newContiguous(src);

    THCudaCheck(cudaMemcpy(selfc->storage->data + selfc->storageOffset,
                           src->storage->data + src->storageOffset,
                           THCudaTensor_nElement(src) * sizeof(float),
                           cudaMemcpyDeviceToHost));

    THCudaTensor_free(src);
    THFloatTensor_freeCopyTo(selfc, self);
  }
}
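A possible caller sketch (not from the original source), assuming the same state-less API as this example; copyCuda_example is an illustrative name.

// Hypothetical sketch: bring a CUDA tensor back to host memory as a flat
// THFloatTensor with the same number of elements.
static void copyCuda_example(struct THCudaTensor *gpu)
{
  THFloatTensor *cpu = THFloatTensor_newWithSize1d(THCudaTensor_nElement(gpu));
  THFloatTensor_copyCuda(cpu, gpu);   // device-to-host copy as defined above
  /* ... use THFloatTensor_data(cpu) on the host ... */
  THFloatTensor_free(cpu);
}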
Example #5
// Copy extracted patches to CUDA memory and run the network.
// Keep in mind that GPU memory is limited, and extracting too many patches
// at once might cause trouble. So if you need to extract a lot of patches,
// an efficient approach is to divide the set into smaller, equal parts and
// preallocate the CPU and GPU memory.
void extractDescriptors(THCState *state,
    cunn::Sequential::Ptr net,
    const std::vector<cv::Mat>& patches,
    cv::Mat& descriptors)
{
  size_t batch_size = 128;
  size_t N = patches.size();

  THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M);
  THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M);

  for(int j=0; j < ceil((float)N/batch_size); ++j)
  {
    float *data = THFloatTensor_data(buffer);
    size_t k = 0;
    for(size_t i = j*batch_size; i < std::min((j+1)*batch_size, N); ++i, ++k)
      memcpy(data + k*M*M, patches[i].data, sizeof(float) * M * M);

    // initialize 4D CUDA tensor and copy patches into it
    THCudaTensor_copyFloat(state, input, buffer);

    // propagate through the network
    THCudaTensor *output = net->forward(input);

    // copy descriptors back
    THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]);
    THFloatTensor_copyCuda(state, desc, output);

    size_t feature_dim = output->size[1];
    if(descriptors.cols != (int)feature_dim || descriptors.rows != (int)N)
      descriptors.create(N, feature_dim, CV_32F);

    memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float),
        THFloatTensor_data(desc),
        sizeof(float) * feature_dim * k);

    THFloatTensor_free(desc);
  }

  THCudaTensor_free(state, input);
  THFloatTensor_free(buffer);
}
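A hypothetical caller sketch (not part of the original code): it assumes each patch is an M x M single-channel image and that the network has already been loaded elsewhere; describePatches and raw_patches are illustrative names.

// Hypothetical sketch: normalize input patches to CV_32F before extraction,
// since the memcpy above assumes 32-bit float patch data of size M x M.
void describePatches(THCState *state, cunn::Sequential::Ptr net,
                     const std::vector<cv::Mat>& raw_patches, cv::Mat& descriptors)
{
  std::vector<cv::Mat> patches(raw_patches.size());
  for(size_t i = 0; i < raw_patches.size(); ++i)
    raw_patches[i].convertTo(patches[i], CV_32F);
  extractDescriptors(state, net, patches, descriptors);
  // descriptors now holds raw_patches.size() rows, one feature vector per patch
}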
Example #6
void transfer_tensor_CUDA(THCState *state, THCudaTensor *dst, struct TensorWrapper srcWrapper) {

    THCudaTensor *src = reinterpret_cast<THCudaTensor *>(srcWrapper.tensorPtr);

    dst->nDimension = src->nDimension;
    dst->refcount = src->refcount;

    dst->storage = src->storage;

    if (!srcWrapper.definedInLua) {
        // Don't let Torch deallocate size and stride arrays
        dst->size = src->size;
        dst->stride = src->stride;
        src->size = nullptr;
        src->stride = nullptr;
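        // Bump the shared storage's refcount so that freeing src below does not
        // release the storage that dst now points to.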
        THAtomicIncrementRef(&src->storage->refcount);
        THCudaTensor_free(state, src);
    } else {
        dst->size   = static_cast<long *>(THAlloc(sizeof(long) * dst->nDimension));
        dst->stride = static_cast<long *>(THAlloc(sizeof(long) * dst->nDimension));
        memcpy(dst->size,   src->size,   src->nDimension * sizeof(long));
        memcpy(dst->stride, src->stride, src->nDimension * sizeof(long));
    }
}