// Copy extracted patches to CUDA memory and run the network // One has to keep mind that GPU memory is limited and extracting too many patches // at once might cause troubles // So if you need to extract a lot of patches, an efficient way would be to // devide the set in smaller equal parts and preallocate CPU and GPU memory void extractDescriptors(THCState *state, cunn::Sequential::Ptr net, const std::vector<cv::Mat>& patches, cv::Mat& descriptors) { size_t batch_size = 128; size_t N = patches.size(); THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M); THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M); for(int j=0; j < ceil((float)N/batch_size); ++j) { float *data = THFloatTensor_data(buffer); size_t k = 0; for(size_t i = j*batch_size; i < std::min((j+1)*batch_size, N); ++i, ++k) memcpy(data + k*M*M, patches[i].data, sizeof(float) * M * M); // initialize 4D CUDA tensor and copy patches into it THCudaTensor_copyFloat(state, input, buffer); // propagate through the network THCudaTensor *output = net->forward(input); // copy descriptors back THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]); THFloatTensor_copyCuda(state, desc, output); size_t feature_dim = output->size[1]; if(descriptors.cols != feature_dim || descriptors.rows != N) descriptors.create(N, feature_dim, CV_32F); memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float), THFloatTensor_data(desc), sizeof(float) * feature_dim * k); THFloatTensor_free(desc); } THCudaTensor_free(state, input); THFloatTensor_free(buffer); }
// Lua binding: load the next batch of images into a 4D float tensor.
// Arg 1 (optional): a torch.FloatTensor to reuse as the destination buffer;
// it is accepted only if it is 4D with 3 channels and matches the configured
// size/count, otherwise a fresh tensor is allocated.
// Arg 2: 1-based size index into the global `sizes` table.
// Returns 2 values: the N x 3 x H x W tensor and a Lua array of per-image
// tables {filename, width, height}. Returns 0 values when nothing was loaded.
// NOTE(review): relies on file-level globals `max`, `nsizes`, `sizes`,
// `images` and helpers `loadnextimage`/`scale`/`rgb_tofloat` defined
// elsewhere in this file — semantics assumed from usage here.
static int luafunc_load(lua_State *L)
{
	THFloatTensor *t = 0;
	const char *tname = luaT_typename(L, 1);
	int i, index = lua_tointeger(L, 2);

	// `max` doubles as the "initialized" flag: zero means init was never called.
	if(max == 0)
		luaL_error(L, "fastimage.init: call init first");
	if(index > nsizes)
		luaL_error(L, "Invalid size index %d", index);
	// Convert the 1-based Lua index to 0-based, clamping below to 0.
	index--;
	if(index < 0)
		index = 0;
	// Try to reuse the caller-supplied FloatTensor as the destination.
	if(tname && !strcmp(tname, "torch.FloatTensor"))
	{
		t = luaT_toudata(L, 1, luaT_typenameid(L, "torch.FloatTensor"));
		if(t->nDimension == 4 && t->size[1] == 3)
		{
			if(nsizes == 1)
			{
				// Single-size mode: adopt the tensor's geometry as the configured size.
				sizes[0].width = t->size[3];
				sizes[0].height = t->size[2];
				max = t->size[0];
			} else if(sizes[0].width != t->size[3] || sizes[0].height != t->size[2] || max != t->size[0])
				t = 0;	// geometry mismatch: fall back to allocating a new tensor
		} else t = 0;	// wrong rank or channel count: cannot reuse
	}
	// index == 0 means the first (or only) size: advance to the next batch of
	// images. For higher indices the images loaded previously are re-rendered
	// at a different size — presumably; verify against callers.
	if(!index)
	{
		// Release bitmaps from the previous batch.
		for(i = 0; i < max; i++)
			if(images[i].bitmap)
			{
				free(images[i].bitmap);
				images[i].bitmap = 0;
			}
		// Load up to `max` new images; stop early when the source is exhausted.
		for(i = 0; i < max; i++)
		{
			if(loadnextimage(images + i))
				break;
		}
		if(i == 0)
		{
			lprintf("Nothing found\n");
			return 0;	// no images left: return no values to Lua
		}
		if(i < max)
		{
			// Short batch: shrink `max` and narrow the reused tensor to match.
			max = i;
			if(t)
				t = THFloatTensor_newNarrow(t, 0, 0, i);
		}
	}
	// Rescale each image and write it into the tensor as float RGB planes.
	for(i = 0; i < max; i++)
	{
		// A zero width/height in single-size mode means "native size":
		// lock in the first image's dimensions.
		if(nsizes == 1 && (!sizes[0].width || !sizes[0].height))
		{
			lprintf("Set width = %d, height = %d\n", images[i].width, images[i].height);
			sizes[0].width = images[i].width;
			sizes[0].height = images[i].height;
		}
		// Lazy allocation: only now are the final dimensions guaranteed known.
		if(!t)
			t = THFloatTensor_newWithSize4d(max, 3, sizes[index].height, sizes[index].width);
		uint8_t *rescaled = scale(images + i, sizes[index].width, sizes[index].height);
		rgb_tofloat(THFloatTensor_data(t) + i * t->stride[0], t->stride[1], t->stride[2], rescaled, sizes[index].width, sizes[index].height);
		// scale() may return the original bitmap when no resizing was needed;
		// only free the buffer if it is a separate allocation.
		if(rescaled != images[i].bitmap)
			free(rescaled);
		if(nsizes == 1 && images[i].bitmap)
		{
			// It's not necessary to keep all the images in memory, if there is only one size
			free(images[i].bitmap);
			images[i].bitmap = 0;
		}
	}
	lprintf("%d x 3 x %d x %d tensor returned\n", i, sizes[index].height, sizes[index].width);
	// Return value 1: the tensor (ownership transferred to Lua's GC).
	luaT_pushudata(L, t, "torch.FloatTensor");
	// Return value 2: array of {filename, width, height} tables, one per image.
	lua_createtable(L, max, 0);
	for(i = 0; i < max; i++)
	{
		lua_pushinteger(L, i+1);	// 1-based array key
		lua_createtable(L, 0, 3);
		lua_pushstring(L, "filename");
		lua_pushstring(L, images[i].filename);
		lua_settable(L, -3);
		lua_pushstring(L, "width");
		lua_pushinteger(L, images[i].width);
		lua_settable(L, -3);
		lua_pushstring(L, "height");
		lua_pushinteger(L, images[i].height);
		lua_settable(L, -3);
		lua_settable(L, -3);	// outer[i+1] = inner table
	}
	return 2;
}