/* now we overwrite some methods specific to CudaTensor */

/* Lua binding: tensor:copy(src).
 * Arg 1 must be a torch.CudaTensor (the destination); arg 2 may be any
 * torch.*Tensor type. Dispatches to the matching THCudaTensor_copy* routine
 * based on the runtime type of arg 2, raises a Lua type error otherwise.
 * Returns the destination tensor (arg 1) to allow call chaining.
 *
 * Fix: the original had a trailing `else if` re-testing "torch.CudaTensor"
 * that called THCudaTensor_copyCuda — unreachable dead code, because the
 * first branch already matches that type. It has been removed; the first
 * branch handles CUDA-to-CUDA copies. */
static int cutorch_CudaTensor_copy(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  THCudaTensor *tensor = luaT_checkudata(L, 1, "torch.CudaTensor");
  void *src;
  if( (src = luaT_toudata(L, 2, "torch.CudaTensor")) )
    THCudaTensor_copy(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.ByteTensor")) )
    THCudaTensor_copyByte(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.CharTensor")) )
    THCudaTensor_copyChar(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.ShortTensor")) )
    THCudaTensor_copyShort(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.IntTensor")) )
    THCudaTensor_copyInt(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.LongTensor")) )
    THCudaTensor_copyLong(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.FloatTensor")) )
    THCudaTensor_copyFloat(state, tensor, src);
  else if( (src = luaT_toudata(L, 2, "torch.DoubleTensor")) )
    THCudaTensor_copyDouble(state, tensor, src);
  else
    luaL_typerror(L, 2, "torch.*Tensor");
  /* discard everything but the destination tensor and return it */
  lua_settop(L, 1);
  return 1;
}
// Copy extracted patches to CUDA memory and run the network // One has to keep mind that GPU memory is limited and extracting too many patches // at once might cause troubles // So if you need to extract a lot of patches, an efficient way would be to // devide the set in smaller equal parts and preallocate CPU and GPU memory void extractDescriptors(THCState *state, cunn::Sequential::Ptr net, const std::vector<cv::Mat>& patches, cv::Mat& descriptors) { size_t batch_size = 128; size_t N = patches.size(); THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M); THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M); for(int j=0; j < ceil((float)N/batch_size); ++j) { float *data = THFloatTensor_data(buffer); size_t k = 0; for(size_t i = j*batch_size; i < std::min((j+1)*batch_size, N); ++i, ++k) memcpy(data + k*M*M, patches[i].data, sizeof(float) * M * M); // initialize 4D CUDA tensor and copy patches into it THCudaTensor_copyFloat(state, input, buffer); // propagate through the network THCudaTensor *output = net->forward(input); // copy descriptors back THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]); THFloatTensor_copyCuda(state, desc, output); size_t feature_dim = output->size[1]; if(descriptors.cols != feature_dim || descriptors.rows != N) descriptors.create(N, feature_dim, CV_32F); memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float), THFloatTensor_data(desc), sizeof(float) * feature_dim * k); THFloatTensor_free(desc); } THCudaTensor_free(state, input); THFloatTensor_free(buffer); }