// Copy extracted patches to CUDA memory and run the network // One has to keep mind that GPU memory is limited and extracting too many patches // at once might cause troubles // So if you need to extract a lot of patches, an efficient way would be to // devide the set in smaller equal parts and preallocate CPU and GPU memory void extractDescriptors(THCState *state, cunn::Sequential::Ptr net, const std::vector<cv::Mat>& patches, cv::Mat& descriptors) { size_t batch_size = 128; size_t N = patches.size(); THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M); THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M); for(int j=0; j < ceil((float)N/batch_size); ++j) { float *data = THFloatTensor_data(buffer); size_t k = 0; for(size_t i = j*batch_size; i < std::min((j+1)*batch_size, N); ++i, ++k) memcpy(data + k*M*M, patches[i].data, sizeof(float) * M * M); // initialize 4D CUDA tensor and copy patches into it THCudaTensor_copyFloat(state, input, buffer); // propagate through the network THCudaTensor *output = net->forward(input); // copy descriptors back THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]); THFloatTensor_copyCuda(state, desc, output); size_t feature_dim = output->size[1]; if(descriptors.cols != feature_dim || descriptors.rows != N) descriptors.create(N, feature_dim, CV_32F); memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float), THFloatTensor_data(desc), sizeof(float) * feature_dim * k); THFloatTensor_free(desc); } THCudaTensor_free(state, input); THFloatTensor_free(buffer); }
// 3-D constructor shim: delegates to the 4-D constructor, passing -1 for the
// trailing dimension (the THC convention for an unused size — verify).
THCudaTensor *THCudaTensor_newWithSize3d(THCState *state, long size0, long size1, long size2)
{
  return THCudaTensor_newWithSize4d(state, size0, size1, size2, -1);
}
// 1-D constructor shim: delegates to the 4-D constructor, passing -1 for each
// of the three trailing dimensions (the THC convention for unused sizes — verify).
THCudaTensor *THCudaTensor_newWithSize1d(THCState *state, long size0)
{
  return THCudaTensor_newWithSize4d(state, size0, -1, -1, -1);
}