/*
 * Based on the implementation of THTensor_(indexCopy) in torch7
 */
static void THCudaTensor_indexCopy(THCudaTensor *tensor, int dim, THLongTensor *index, THCudaTensor *src)
{
  long i, numel;
  THCudaTensor *tSlice, *sSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < src->nDimension, 4, "Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);

  for (i = 0; i < numel; i++)
  {
    if (tensor->nDimension > 1)
    {
      tSlice = THCudaTensor_new();
      sSlice = THCudaTensor_new();
      THCudaTensor_select(tSlice, tensor, dim, index_data[i] - 1);
      THCudaTensor_select(sSlice, src, dim, i);
      THCudaTensor_copy(tSlice, sSlice);
      THCudaTensor_free(tSlice);
      THCudaTensor_free(sSlice);
    }
    else
    {
      // It is faster to copy a float from one device address to another device
      // address than to retrieve it to host memory and copy it back to the device.
      THCudaCheck(cudaMemcpy(tensor->storage->data + tensor->storageOffset + index_data[i] - 1,
                             src->storage->data + src->storageOffset + i,
                             sizeof(float), cudaMemcpyDeviceToDevice));
    }
  }

  THLongTensor_free(index);
}
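A minimal usage sketch, assuming the old stateless cutorch API that this snippet targets; the tensor and index names below are illustrative, not from the original code. Note that the indices are 1-based, which is why the code above subtracts 1 from index_data[i].

// Hypothetical example: copy the two rows of `src` into rows 4 and 2 of `dst`.
THCudaTensor *dst = THCudaTensor_newWithSize2d(5, 3);
THCudaTensor *src = THCudaTensor_newWithSize2d(2, 3);
THLongTensor *idx = THLongTensor_newWithSize1d(2);

THCudaTensor_fill(dst, 0.0f);
THCudaTensor_fill(src, 1.0f);
THLongTensor_set1d(idx, 0, 4);   /* 1-based row indices */
THLongTensor_set1d(idx, 1, 2);

THCudaTensor_indexCopy(dst, 0, idx, src);   /* dim 0 selects rows */

THLongTensor_free(idx);
THCudaTensor_free(src);
THCudaTensor_free(dst);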
void THCudaTensor_freeCopyTo(THCState *state, THCudaTensor *self, THCudaTensor *dst)
{
  if (self != dst)
    THCudaTensor_copy(state, dst, self);

  THCudaTensor_free(state, self);
}
/*
 * Based on the implementation of THTensor_(indexFill) in torch7
 */
static void THCudaTensor_indexFill(THCudaTensor *tensor, int dim, THLongTensor *index, float val)
{
  long i, numel;
  THCudaTensor *tSlice;
  long *index_data;

  numel = THLongTensor_nElement(index);
  THArgCheck(index->nDimension == 1, 3, "Index is supposed to be a vector");
  THArgCheck(dim < tensor->nDimension, 4, "Indexing dim is out of bounds");

  index = THLongTensor_newContiguous(index);
  index_data = THLongTensor_data(index);

  for (i = 0; i < numel; i++)
  {
    if (tensor->nDimension > 1)
    {
      // create a new CudaTensor
      tSlice = THCudaTensor_new();
      // set its storage to point to the corresponding storage in tensor
      THCudaTensor_select(tSlice, tensor, dim, index_data[i] - 1);
      THCudaTensor_fill(tSlice, val);
      THCudaTensor_free(tSlice);
    }
    else
    {
      THCudaTensor_set1d(tensor, index_data[i] - 1, val);
    }
  }

  THLongTensor_free(index);
}
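The same calling pattern applies to indexFill; a brief sketch, again assuming the stateless API and illustrative names:

// Hypothetical example: set columns 1 and 3 (1-based) of a 4 x 4 tensor to -1.
THCudaTensor *t = THCudaTensor_newWithSize2d(4, 4);
THLongTensor *idx = THLongTensor_newWithSize1d(2);

THCudaTensor_fill(t, 0.0f);
THLongTensor_set1d(idx, 0, 1);
THLongTensor_set1d(idx, 1, 3);

THCudaTensor_indexFill(t, 1, idx, -1.0f);   /* dim 1 selects columns */

THLongTensor_free(idx);
THCudaTensor_free(t);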
void THFloatTensor_copyCuda(THFloatTensor *self, struct THCudaTensor *src)
{
  THArgCheck(THFloatTensor_nElement(self) == THCudaTensor_nElement(src), 2, "sizes do not match");

  {
    THFloatTensor *selfc = THFloatTensor_newContiguous(self);
    src = THCudaTensor_newContiguous(src);

    THCudaCheck(cudaMemcpy(selfc->storage->data + selfc->storageOffset,
                           src->storage->data + src->storageOffset,
                           THCudaTensor_nElement(src) * sizeof(float),
                           cudaMemcpyDeviceToHost));

    THCudaTensor_free(src);
    THFloatTensor_freeCopyTo(selfc, self);
  }
}
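A small usage sketch for pulling a CUDA tensor back to host memory, assuming the stateless API of this snippet (the extractDescriptors function below uses the newer THCState-aware variant of the same call); names are illustrative:

// Hypothetical example: allocate a matching host tensor and copy device -> host.
THCudaTensor *gpu = THCudaTensor_newWithSize2d(128, 128);
THFloatTensor *cpu = THFloatTensor_newWithSize2d(128, 128);

THCudaTensor_fill(gpu, 3.14f);
THFloatTensor_copyCuda(cpu, gpu);   /* element counts must match */

float v = THFloatTensor_get2d(cpu, 0, 0);   /* v == 3.14f */

THFloatTensor_free(cpu);
THCudaTensor_free(gpu);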
// Copy extracted patches to CUDA memory and run the network.
// Keep in mind that GPU memory is limited, and extracting too many patches
// at once may cause trouble. If you need to extract a lot of patches, an
// efficient approach is to divide the set into smaller, equal parts and
// preallocate CPU and GPU memory.
void extractDescriptors(THCState *state, cunn::Sequential::Ptr net, const std::vector<cv::Mat>& patches, cv::Mat& descriptors)
{
  size_t batch_size = 128;
  size_t N = patches.size();

  THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M);
  THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M);

  for(int j = 0; j < ceil((float)N / batch_size); ++j)
  {
    float *data = THFloatTensor_data(buffer);
    size_t k = 0;
    for(size_t i = j * batch_size; i < std::min((j + 1) * batch_size, N); ++i, ++k)
      memcpy(data + k * M * M, patches[i].data, sizeof(float) * M * M);

    // initialize 4D CUDA tensor and copy patches into it
    THCudaTensor_copyFloat(state, input, buffer);

    // propagate through the network
    THCudaTensor *output = net->forward(input);

    // copy descriptors back
    THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]);
    THFloatTensor_copyCuda(state, desc, output);

    size_t feature_dim = output->size[1];
    if(descriptors.cols != feature_dim || descriptors.rows != N)
      descriptors.create(N, feature_dim, CV_32F);

    memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float),
           THFloatTensor_data(desc),
           sizeof(float) * feature_dim * k);

    THFloatTensor_free(desc);
  }

  THCudaTensor_free(state, input);
  THFloatTensor_free(buffer);
}
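A sketch of a possible call site. The loadNetwork helper, the "net.t7" file name, and the way patches are produced are assumptions, not part of the original code; M is the patch side length used above, and each patch must be a single-channel CV_32F image of size M x M.

// Hypothetical call site.
THCState *state = (THCState*)malloc(sizeof(THCState));
THCudaInit(state);

cunn::Sequential::Ptr net = loadNetwork(state, "net.t7");   // hypothetical loader

std::vector<cv::Mat> patches;   // each patch: M x M, CV_32F, single channel
// ... fill `patches` from the detector of your choice ...

cv::Mat descriptors;
extractDescriptors(state, net, patches, descriptors);
// descriptors now holds one feature_dim-dimensional row per patch

THCudaShutdown(state);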
void transfer_tensor_CUDA(THCState *state, THCudaTensor *dst, struct TensorWrapper srcWrapper)
{
  THCudaTensor *src = reinterpret_cast<THCudaTensor *>(srcWrapper.tensorPtr);

  dst->nDimension = src->nDimension;
  dst->refcount = src->refcount;
  dst->storage = src->storage;

  if (!srcWrapper.definedInLua)
  {
    // Don't let Torch deallocate size and stride arrays
    dst->size = src->size;
    dst->stride = src->stride;
    src->size = nullptr;
    src->stride = nullptr;
    THAtomicIncrementRef(&src->storage->refcount);
    THCudaTensor_free(state, src);
  }
  else
  {
    dst->size = static_cast<long *>(THAlloc(sizeof(long) * dst->nDimension));
    dst->stride = static_cast<long *>(THAlloc(sizeof(long) * dst->nDimension));
    memcpy(dst->size, src->size, src->nDimension * sizeof(long));
    memcpy(dst->stride, src->stride, src->nDimension * sizeof(long));
  }
}