THFloatTensor *nn_SpatialConvolutionMM_updateOutput(struct module *module, THFloatTensor *input)
{
	int kW = module->SpatialConvolution.kW;
	int kH = module->SpatialConvolution.kH;
	int dW = module->SpatialConvolution.dW;
	int dH = module->SpatialConvolution.dH;
	int padW = module->SpatialConvolution.padW;
	int padH = module->SpatialConvolution.padH;
	THFloatTensor *finput = module->SpatialConvolution.finput;
	THFloatTensor *weight = module->SpatialConvolution.weight;
	THFloatTensor *bias = module->SpatialConvolution.bias;
	THFloatTensor *output = module->output;

	/* Promote a single 3D image to a 4D batch of one */
	int batch = 1;
	if (input->nDimension == 3)
	{
		batch = 0;
		THFloatTensor_resize4d(input, 1, input->size[0], input->size[1], input->size[2]);
	}

	long batchSize = input->size[0];
	long nInputPlane = module->SpatialConvolution.nInputPlane;
	long nOutputPlane = module->SpatialConvolution.nOutputPlane;
	long inputWidth = input->size[3];
	long inputHeight = input->size[2];
	long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
	long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;

	if (outputWidth < 1 || outputHeight < 1)
		THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
			nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth);

	THFloatTensor_resize3d(finput, batchSize, kW*kH*nInputPlane, outputHeight*outputWidth);
	THFloatTensor_resize4d(output, batchSize, nOutputPlane, outputHeight, outputWidth);

	long t;
#pragma omp parallel for if(batchSize >= 4) private(t)
	for (t = 0; t < batchSize; t++)
	{
		THFloatTensor *input_t = THFloatTensor_newSelect(input, 0, t);
		THFloatTensor *output_t = THFloatTensor_newSelect(output, 0, t);
		THFloatTensor *finput_t = THFloatTensor_newSelect(finput, 0, t);
		nn_SpatialConvolutionMM_updateOutput_frame(input_t, output_t, weight, bias, finput_t,
			kW, kH, dW, dH, padW, padH,
			nInputPlane, inputWidth, inputHeight,
			nOutputPlane, outputWidth, outputHeight);
		THFloatTensor_free(input_t);
		THFloatTensor_free(output_t);
		THFloatTensor_free(finput_t);
	}

	/* Undo the batch promotion for 3D inputs */
	if (batch == 0)
	{
		THFloatTensor_resize3d(output, nOutputPlane, outputHeight, outputWidth);
		THFloatTensor_resize3d(input, nInputPlane, inputHeight, inputWidth);
	}
	return output;
}
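/* Worked example of the output-size formula used above (numbers are
   illustrative, not from the source): a 224x224 input with kW = kH = 3,
   dW = dH = 1 and padW = padH = 1 gives (224 + 2*1 - 3)/1 + 1 = 224, i.e. a
   "same"-size convolution; with padW = padH = 0 it would give 222. */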
int THProcessFloat(THNETWORK *network, float *data, int batchsize, int width, int height, float **result, int *outwidth, int *outheight)
{
	int b, c, i;
	THFloatTensor *t = THFloatTensor_new();
	THFloatTensor *out;

	/* Wrap the caller's buffer in a batchsize x 3 x height x width tensor without copying */
	t->nDimension = 4;
	t->size[0] = batchsize;
	t->size[1] = 3;
	t->size[2] = height;
	t->size[3] = width;
	t->stride[0] = 3 * width * height;
	t->stride[1] = width * height;
	t->stride[2] = width;
	t->stride[3] = 1;
	t->storage = THFloatStorage_newwithbuffer(data);

	/* Normalize in place: note that this modifies the caller's buffer */
#pragma omp parallel for private(b, c, i)
	for(b = 0; b < batchsize; b++)
		for(c = 0; c < 3; c++)
			for(i = 0; i < width*height; i++)
				data[b * t->stride[0] + c * t->stride[1] + i] =
					(data[b * t->stride[0] + c * t->stride[1] + i] - network->mean[c]) / network->std[c];

#ifdef CUDNN
	if(network->net->cuda)
	{
		THFloatTensor *t2 = THCudaTensor_newFromFloatTensor(t);
		out = forward(network->net, t2);
		THFloatTensor_free(t2);
		if(network->out)
			THFloatTensor_free(network->out);
		network->out = THFloatTensor_newFromCudaTensor(out);
		out = network->out;
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
	{
		THFloatTensor *t2 = THOpenCLTensor_newFromImageTensor(t);
		out = forward(network->net, t2);
		THFloatTensor_free(t2);
		if(network->out)
			THFloatTensor_free(network->out);
		network->out = THFloatTensor_newFromOpenCLImageTensor(out);
		out = network->out;
	} else
#endif
	out = forward(network->net, t);
	THFloatTensor_free(t);

	*result = out->storage->data;
	if(out->nDimension >= 3)
	{
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else
		*outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}
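/* A minimal usage sketch for THProcessFloat, not part of the library: the
   function name, buffer and sizes below are assumptions for illustration.
   The input is a batchsize x 3 x height x width planar float buffer, which
   THProcessFloat normalizes in place; the returned pointer aliases
   network-owned storage and must not be freed by the caller. */
static void example_THProcessFloat(THNETWORK *network, float *pixels)
{
	float *result;
	int outwidth, outheight;
	/* one 3x224x224 image, normalized and forwarded in a single call */
	int n = THProcessFloat(network, pixels, 1, 224, 224, &result, &outwidth, &outheight);
	printf("%d outputs, %dx%d spatial\n", n, outwidth, outheight);
}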
void THMakeSpatial(THNETWORK *network)
{
	/* Running spatial size and plane count; 231x231x3 is the assumed network input */
	int i, size = 231, nInputPlane = 3;

	for(i = 0; i < network->net->nelem; i++)
	{
		if(network->net->modules[i].type == MT_View || network->net->modules[i].type == MT_Reshape)
		{
			/* Views and reshapes are unnecessary in a fully spatial network: drop them */
			THFloatTensor_free(network->net->modules[i].output);
			memmove(network->net->modules + i, network->net->modules + i + 1,
				sizeof(*network->net->modules) * (network->net->nelem - i - 1));
			network->net->nelem--;
			i--;
		} else if(network->net->modules[i].type == MT_Linear)
		{
			/* Replace each Linear with a convolution whose kernel covers the whole remaining map */
			THFloatTensor_free(network->net->modules[i].Linear.addBuffer);
			network->net->modules[i].updateOutput = nn_SpatialConvolutionMM_updateOutput;
#ifndef USEBLAS
			network->net->modules[i].type = MT_SpatialConvolutionVirtMM;
#else
			network->net->modules[i].type = MT_SpatialConvolutionMM;
#endif
			struct SpatialConvolution *c = &network->net->modules[i].SpatialConvolution;
			c->finput = THFloatTensor_new();
			c->padW = c->padH = 0;
			c->dW = c->dH = 1;
			c->kW = c->kH = size;
			c->nInputPlane = nInputPlane;
			nInputPlane = c->nOutputPlane = c->weight->size[0];
			size = (size + 2*c->padW - c->kW) / c->dW + 1;
		} else if(network->net->modules[i].type == MT_SpatialConvolution ||
			network->net->modules[i].type == MT_SpatialConvolutionMM ||
			network->net->modules[i].type == MT_SpatialConvolutionVirtMM)
		{
			struct SpatialConvolution *c = &network->net->modules[i].SpatialConvolution;
			size = (size + 2*c->padW - c->kW) / c->dW + 1;
			nInputPlane = c->nOutputPlane;
		} else if(network->net->modules[i].type == MT_SpatialMaxPooling)
		{
			struct SpatialMaxPooling *c = &network->net->modules[i].SpatialMaxPooling;
			if(c->ceil_mode)
				size = (long)(ceil((float)(size - c->kH + 2*c->padH) / c->dH)) + 1;
			else
				size = (long)(floor((float)(size - c->kH + 2*c->padH) / c->dH)) + 1;
		} else if(network->net->modules[i].type == MT_SpatialZeroPadding)
		{
			struct SpatialZeroPadding *c = &network->net->modules[i].SpatialZeroPadding;
			size += c->pad_l + c->pad_r;
		}
	}
}
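/* Worked trace of the bookkeeping above (illustrative numbers, not from the
   source): if the map reaching the first Linear is 6x6 with 256 planes, that
   Linear becomes a 256-input-plane convolution with kW = kH = 6, after which
   size = (6 - 6)/1 + 1 = 1, so every following Linear turns into a 1x1
   convolution. */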
int THProcessYUYV(THNETWORK *network, unsigned char *image, int width, int height, float **results, int *outwidth, int *outheight)
{
	THFloatTensor *out;
	THFloatStorage *st;

#ifdef CUDNN
	if(network->net->cuda)
		THError("This function is not supported with CUDNN");
#endif
#ifdef OPENCL
	if(network->net->opencl)
		THError("This function is not supported with OpenCL");
#endif
	/* Convert YUYV to normalized planar RGB floats */
	st = THFloatStorage_new(width * height * 3);
	yuyv2fRGB(image, st->data, width*height, width, width, height, network->mean, network->std);
	THFloatTensor *t = THFloatTensor_new();
	t->storage = st;
	t->nDimension = 3;
	t->size[0] = 3;
	t->size[1] = height;
	t->size[2] = width;
	t->stride[0] = width * height;
	t->stride[1] = width;
	t->stride[2] = 1;
	out = forward(network->net, t);
	THFloatTensor_free(t);
	*results = out->storage->data;
	if(out->nDimension >= 3)
	{
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else
		*outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}
static void nn_SpatialConvolutionMM_updateOutput_frame(THFloatTensor *input, THFloatTensor *output,
	THFloatTensor *weight, THFloatTensor *bias, THFloatTensor *finput,
	int kW, int kH, int dW, int dH, int padW, int padH,
	long nInputPlane, long inputWidth, long inputHeight,
	long nOutputPlane, long outputWidth, long outputHeight)
{
	long i;

	/* im2col: unfold input patches into finput, a (kW*kH*nInputPlane) x (outputHeight*outputWidth) matrix */
	nn_unfolded_copy(finput, input, kW, kH, dW, dH, padW, padH,
		nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);

	/* View the output as a nOutputPlane x (outputHeight*outputWidth) matrix sharing the same storage */
	THFloatTensor *output2d = THFloatTensor_newWithStorage2d(output->storage, output->storageOffset,
		nOutputPlane, -1, outputHeight*outputWidth, -1);

	/* Broadcast each bias over its output plane */
	for (i = 0; i < nOutputPlane; i++)
	{
		float *data = output->storage->data + output->storageOffset + output->stride[0]*i;
		THFloatVector_fill(data, bias->storage->data[i], outputHeight*outputWidth);
	}

	/* A single GEMM computes the whole convolution: output2d += weight * finput */
	THFloatTensor_addmm(output2d, 1, output2d, 1, weight, finput);
	THFloatTensor_free(output2d);
}
// Copy extracted patches to CUDA memory and run the network.
// Keep in mind that GPU memory is limited and extracting too many patches
// at once might cause trouble, so if you need to extract a lot of patches,
// an efficient way is to divide the set into smaller equal parts and
// preallocate CPU and GPU memory, as done here with a fixed batch size.
void extractDescriptors(THCState *state, cunn::Sequential::Ptr net, const std::vector<cv::Mat>& patches, cv::Mat& descriptors)
{
	size_t batch_size = 128;
	size_t N = patches.size();
	THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M);
	THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M);

	for(int j = 0; j < ceil((float)N/batch_size); ++j)
	{
		// gather up to batch_size patches into the CPU staging buffer
		float *data = THFloatTensor_data(buffer);
		size_t k = 0;
		for(size_t i = j*batch_size; i < std::min((j+1)*batch_size, N); ++i, ++k)
			memcpy(data + k*M*M, patches[i].data, sizeof(float) * M * M);

		// copy the batch into the preallocated 4D CUDA tensor
		THCudaTensor_copyFloat(state, input, buffer);

		// propagate through the network
		THCudaTensor *output = net->forward(input);

		// copy descriptors back
		THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]);
		THFloatTensor_copyCuda(state, desc, output);

		size_t feature_dim = output->size[1];
		if(descriptors.cols != (int)feature_dim || descriptors.rows != (int)N)
			descriptors.create(N, feature_dim, CV_32F);

		memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float),
			THFloatTensor_data(desc), sizeof(float) * feature_dim * k);
		THFloatTensor_free(desc);
	}
	THCudaTensor_free(state, input);
	THFloatTensor_free(buffer);
}
void THCudaTensor_copyFloat(THCudaTensor *self, struct THFloatTensor *src)
{
	THArgCheck(THCudaTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match");
	{
		THCudaTensor *selfc = THCudaTensor_newContiguous(self);
		src = THFloatTensor_newContiguous(src);
		THCudaCheck(cudaMemcpy(selfc->storage->data + selfc->storageOffset,
			src->storage->data + src->storageOffset,
			THFloatTensor_nElement(src) * sizeof(float),
			cudaMemcpyHostToDevice));
		THFloatTensor_free(src);
		THCudaTensor_freeCopyTo(selfc, self);
	}
}
void THFreeNetwork(THNETWORK *network)
{
	freenetwork(network->net);
	if(network->netobj)
	{
		freeobject(network->netobj);
		free(network->netobj);
	}
	if(network->statobj)
	{
		freeobject(network->statobj);
		free(network->statobj);
	}
	if(network->out)
		THFloatTensor_free(network->out);
	free(network);
}
THFloatTensor *forward(struct network *net, THFloatTensor *in)
{
	int i;
	double t = 0, convtot = 0, convflops = 0;

#ifdef OPENCL
	if(net->opencl == 1)
		OpenCL_Build(net, in);
#endif
	for(i = 0; i < net->nelem; i++)
	{
		if(th_profile)
			t = th_seconds();
		in = net->modules[i].updateOutput(&net->modules[i], in);
		// Free intermediate results; remove these lines if memory is not a concern
		if(i > 0)
		{
			THFloatTensor_free(net->modules[i-1].output);
			net->modules[i-1].output = THFloatTensor_new();
		}
		if(th_profile)
		{
#ifdef OPENCL
			if(net->opencl)
				clFinish(cl_queue);
#endif
			t = th_seconds() - t;
			if(net->modules[i].type == MT_SpatialConvolutionMM ||
				net->modules[i].type == MT_SpatialConvolutionVirtMM ||
				net->modules[i].type == MT_SpatialConvolution)
			{
				// Two flops (multiply + add) per weight element per output element
				double flops = 2.0 * THFloatTensor_nElement(in) *
					net->modules[i].SpatialConvolution.nInputPlane *
					net->modules[i].SpatialConvolution.kW *
					net->modules[i].SpatialConvolution.kH;
				printf("%f seconds for module %d, %f Gflops/s\n", t, i+1, flops * 1e-9 / t);
				convtot += t;
				convflops += flops;
			} else
				printf("%f seconds for module %d\n", t, i+1);
		}
		if(th_debug > 1)
			printf("%d) %d %d %ld %ld %ld %ld\n", i+1, net->modules[i].type, in->nDimension,
				in->size[0], in->size[1], in->size[2], in->size[3]);
	}
	if(th_profile)
		printf("%f seconds for convolutions %f Gflops/s\n", convtot, convflops * 1e-9 / convtot);
	return in;
}
THFloatTensor *nn_View_updateOutput(struct module *module, THFloatTensor *input)
{
	long nElements = THFloatTensor_nElement(input);
	long numElements = module->View.numElements;
	long batchSize = nElements / numElements;
	THFloatTensor *view;

	/* A view shares the input's storage: only sizes and strides change */
	if (batchSize > 1)
		view = THFloatTensor_newWithStorage2d(input->storage, input->storageOffset, batchSize, numElements, numElements, 1);
	else
		view = THFloatTensor_newWithStorage1d(input->storage, input->storageOffset, numElements, 1);
	THFloatTensor_free(module->output);
	module->output = view;
	return module->output;
}
THFloatTensor *forward(struct network *net, THFloatTensor *in)
{
	int i;

	for(i = 0; i < net->nelem; i++)
	{
		in = net->modules[i].updateOutput(&net->modules[i], in);
		// Free intermediate results; remove these lines if memory is not a concern
		if(i > 0)
		{
			THFloatTensor_free(net->modules[i-1].output);
			net->modules[i-1].output = 0;
		}
		//printf("%d) %d %d %ld %ld %ld %ld\n", i+1, net->modules[i].type, in->nDimension, in->size[0], in->size[1], in->size[2], in->size[3]);
	}
	return in;
}
int THProcessImages(THNETWORK *network, unsigned char **images, int batchsize, int width, int height, int stride, float **results, int *outwidth, int *outheight, int bgr)
{
	int i;
	THFloatTensor *out, *t = 0;
	THFloatStorage *st;

#ifdef CUDNN
	if(network->net->cuda)
	{
#ifdef HAVEFP16
		if(floattype == CUDNN_DATA_HALF)
		{
			st = THCudaStorage_new(batchsize * (width * height * 3));
			for(i = 0; i < batchsize; i++)
				cuda_rgb2half((unsigned short *)st->data + i * (width * height * 3), images[i], width, height, stride, network->mean, network->std, bgr);
		} else
#endif
		{
			st = THCudaStorage_new(batchsize * width * height * 3);
			for(i = 0; i < batchsize; i++)
				cuda_rgb2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std, bgr);
		}
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
		t = OpenCL_LoadImage(images[0], width, height, stride, network->mean, network->std, bgr);
	else
#endif
	{
		/* CPU path: convert and normalize the images into a float storage */
		st = THFloatStorage_new(batchsize * width * height * 3);
		if(bgr)
#pragma omp parallel for if(batchsize>1) private(i)
			for(i = 0; i < batchsize; i++)
				bgr2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std);
		else
#pragma omp parallel for if(batchsize>1) private(i)
			for(i = 0; i < batchsize; i++)
				rgb2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std);
	}
	if(!t)
	{
		/* Wrap the storage in a 3D (single image) or 4D (batch) tensor */
		t = THFloatTensor_new();
		t->storage = st;
		if(batchsize == 1)
		{
			t->nDimension = 3;
			t->size[0] = 3;
			t->size[1] = height;
			t->size[2] = width;
			t->stride[0] = width * height;
			t->stride[1] = width;
			t->stride[2] = 1;
		} else {
			t->nDimension = 4;
			t->size[0] = batchsize;
			t->size[1] = 3;
			t->size[2] = height;
			t->size[3] = width;
			t->stride[0] = 3 * width * height;
			t->stride[1] = width * height;
			t->stride[2] = width;
			t->stride[3] = 1;
		}
	}
#ifdef CUDNN
	if(network->net->cuda)
	{
		out = forward(network->net, t);
		if(network->out)
			THFloatTensor_free(network->out);
#ifdef HAVEFP16
		if(floattype == CUDNN_DATA_HALF)
			network->out = THFloatTensor_newFromHalfCudaTensor(out);
		else
#endif
			network->out = THFloatTensor_newFromCudaTensor(out);
		out = network->out;
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
	{
		out = forward(network->net, t);
		if(network->out)
			THFloatTensor_free(network->out);
#ifdef HAVEFP16
		if(cl_datasize == 2)
			network->out = THFloatTensor_newFromHalfOpenCLImageTensor(out);
		else
#endif
			network->out = THFloatTensor_newFromOpenCLImageTensor(out);
		out = network->out;
	} else
#endif
	out = forward(network->net, t);
	THFloatTensor_free(t);

	*results = out->storage->data;
	if(out->nDimension >= 3)
	{
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else
		*outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}
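/* A minimal usage sketch for THProcessImages, not part of the library: the
   function name and arguments below are assumptions for illustration.
   `bitmap` is a packed RGB image of width x height pixels with 3*width bytes
   per row; the returned pointer aliases network-owned storage. */
static void example_THProcessImages(THNETWORK *network, unsigned char *bitmap, int width, int height)
{
	float *result;
	int outwidth, outheight;
	unsigned char *images[1] = { bitmap };
	int n = THProcessImages(network, images, 1, width, height, 3*width, &result, &outwidth, &outheight, 0);
	printf("%d outputs, %dx%d spatial\n", n, outwidth, outheight);
}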
int main(int argc, char **argv)
{
	THNETWORK *net;
	float *result;
	double t = 0;
	int i, n = 0, rc, outwidth, outheight, runs = 1, print = 0, alg = 1, nbatch = 1;
	const char *modelsdir = 0, *inputfile = 0;

	for(i = 1; i < argc; i++)
	{
		if(argv[i][0] != '-')
			continue;
		switch(argv[i][1])
		{
		case 'm':
			if(i+1 < argc)
				modelsdir = argv[++i];
			break;
		case 'i':
			if(i+1 < argc)
				inputfile = argv[++i];
			break;
		case 'a':
			if(i+1 < argc)
				alg = atoi(argv[++i]);
			break;
		case 'p':
			print = 1;
			break;
		case 'r':
			if(i+1 < argc)
				runs = atoi(argv[++i]);
			break;
		case 'b':
			if(i+1 < argc)
			{
				nbatch = atoi(argv[++i]);
				/* Clamp to the size of the bitmaps array below */
				if(nbatch < 1)
					nbatch = 1;
				else if(nbatch > 256)
					nbatch = 256;
			}
			break;
		}
	}
	if(!modelsdir || !inputfile)
	{
		fprintf(stderr, "Syntax: test -m <models directory> -i <input file>\n");
		fprintf(stderr, "            [-r <number of runs>] [-p(rint results)]\n");
		fprintf(stderr, "            [-a <alg=0:norm,1:MM (default),2:virtMM,3:cuDNN,4:cuDNNhalf>]\n");
		fprintf(stderr, "            [-b <nbatch>]\n");
		return -1;
	}
	if(alg == 4)
	{
		alg = 3;
		THCudaHalfFloat(1);
	}
	THInit();
	net = THLoadNetwork(modelsdir);
	if(net)
	{
		THMakeSpatial(net);
		if(alg == 0)
			THUseSpatialConvolutionMM(net, 0);
		else if(alg == 1 || alg == 2)
			THUseSpatialConvolutionMM(net, alg);
		else if(alg == 3)
		{
			THNETWORK *net2 = THCreateCudaNetwork(net);
			if(!net2)
				THError("CUDA not compiled in");
			THFreeNetwork(net);
			net = net2;
		}
		if(strstr(inputfile, ".t7"))
		{
			struct thobject input_o;

			rc = loadtorch(inputfile, &input_o, 8);
			if(!rc)
			{
				THFloatTensor *in = THFloatTensor_newFromObject(&input_o);
				// With CuDNN the first run does some initializations, so don't count it for timing
				if(alg == 3)
					THProcessFloat(net, in->storage->data, 1, in->size[2], in->size[1], &result, &outwidth, &outheight);
				t = seconds();
				for(i = 0; i < runs; i++)
					n = THProcessFloat(net, in->storage->data, 1, in->size[2], in->size[1], &result, &outwidth, &outheight);
				t = (seconds() - t) / runs;
				THFloatTensor_free(in);
				freeobject(&input_o);
			} else
				printf("Error loading %s\n", inputfile);
		} else {
			img_t image;

			rc = loadimage(inputfile, &image);
			if(!rc)
			{
				unsigned char *bitmaps[256];
				for(i = 0; i < nbatch; i++)
					bitmaps[i] = image.bitmap;
				// With CuDNN the first run does some initializations, so don't count it for timing
				if(alg == 3)
					THProcessImages(net, &image.bitmap, 1, image.width, image.height, 3*image.width, &result, &outwidth, &outheight, 0);
				t = seconds();
				for(i = 0; i < runs; i++)
					n = THProcessImages(net, bitmaps, nbatch, image.width, image.height, 3*image.width, &result, &outwidth, &outheight, 0);
				t = (seconds() - t) / runs;
#ifdef USECUDAHOSTALLOC
				cudaFreeHost(image.bitmap);
#else
				free(image.bitmap);
#endif
			} else
				printf("Error loading image %s\n", inputfile);
		}
		if(print)
			for(i = 0; i < n; i++)
				printf("(%d,%d,%d): %f\n", i/(outwidth*outheight), i % (outwidth*outheight) / outwidth, i % outwidth, result[i]);
		printf("1 run processing time: %lf\n", t);
		THFreeNetwork(net);
	} else
		printf("The network could not be loaded: %d\n", THLastError());
#ifdef MEMORYDEBUG
	debug_memorydump(stderr);
#endif
	return 0;
}
void THFloatTensor_addmm(THFloatTensor *r_, float beta, THFloatTensor *t, float alpha, THFloatTensor *m1, THFloatTensor *m2)
{
	char transpose_r, transpose_m1, transpose_m2;
	THFloatTensor *r__, *m1_, *m2_;

	if( (m1->nDimension != 2) || (m2->nDimension != 2))
		THError("matrices expected, got %dD, %dD tensors", m1->nDimension, m2->nDimension);

	if(m1->size[1] != m2->size[0])
		THError("size mismatch, m1: %ld, m2: %ld", m1->size[1], m2->size[0]);

	if( t->nDimension != 2 )
		THError("matrix expected, got %dD tensor for t", t->nDimension);

	if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) )
		THError("size mismatch, t: %ld, m1: %ld, t: %ld, m2: %ld", t->size[0], m1->size[0], t->size[1], m2->size[1]);

	if(t != r_)
		THError("Not implemented: t != r");

	/* printf("%ldx%ld = %ldx%ld X %ldx%ld\n", r_->size[0], r_->size[1], m1->size[0], m1->size[1], m2->size[0], m2->size[1]); */

	/* r_ */
	if(r_->stride[0] == 1 && r_->stride[1] != 0)
	{
		transpose_r = 'n';
		r__ = r_;
	}
	else if(r_->stride[1] == 1 && r_->stride[0] != 0)
	{
		/* Row-major result: swap the operands so that (m2^T m1^T)^T = m1 m2 */
		THFloatTensor *swap = m2;
		m2 = m1;
		m1 = swap;
		transpose_r = 't';
		r__ = r_;
	}
	else
	{
		THError("Transpose not implemented (1)");
		return;
		/* transpose_r = 'n';
		r__ = THFloatTensor_newWithSize2d(r_->size[1], r_->size[0]);
		THFloatTensor_copy(r__, r_);
		THFloatTensor_transpose(r__, NULL, 0, 1); */
	}

	/* m1 */
	if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
	{
		transpose_m1 = 'n';
		m1_ = m1;
	}
	else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
	{
		transpose_m1 = 't';
		m1_ = m1;
	}
	else
	{
		THError("Transpose not implemented (2)");
		return;
		/* transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
		m1_ = THFloatTensor_newContiguous(m1); */
	}

	/* m2 */
	if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
	{
		transpose_m2 = 'n';
		m2_ = m2;
	}
	else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
	{
		transpose_m2 = 't';
		m2_ = m2;
	}
	else
	{
		THError("Transpose not implemented (3)");
		return;
		/* transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
		m2_ = THFloatTensor_newContiguous(m2); */
	}

	/* do the operation */
	THBlas_gemm(transpose_m1, transpose_m2,
		r__->size[(transpose_r == 'n' ? 0 : 1)],
		r__->size[(transpose_r == 'n' ? 1 : 0)],
		m1_->size[(transpose_r == 'n' ? 1 : 0)],
		alpha,
		THFloatTensor_data(m1_),
		(transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
		THFloatTensor_data(m2_),
		(transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
		beta,
		THFloatTensor_data(r__),
		r__->stride[(transpose_r == 'n' ? 1 : 0)]);

	/* free intermediate variables */
	if(m1_ != m1)
		THFloatTensor_free(m1_);
	if(m2_ != m2)
		THFloatTensor_free(m2_);
	if(r__ != r_)
		THError("freeCopyTo not implemented");
		/* THFloatTensor_freeCopyTo(r__, r_); */
}
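/* A minimal sketch of the addmm contract implemented above, not part of the
   library: with t == r_, the call computes r_ = beta*r_ + alpha*m1*m2 through
   a single SGEMM. The sizes and values below are illustrative. */
static void example_addmm(void)
{
	THFloatTensor *m1 = THFloatTensor_newWithSize2d(4, 3);	/* 4x3, filled with ones */
	THFloatTensor *m2 = THFloatTensor_newWithSize2d(3, 5);	/* 3x5, filled with twos */
	THFloatTensor *r = THFloatTensor_newWithSize2d(4, 5);	/* 4x5 accumulator, zeroed */
	THFloatVector_fill(THFloatTensor_data(m1), 1, 4*3);
	THFloatVector_fill(THFloatTensor_data(m2), 2, 3*5);
	THFloatVector_fill(THFloatTensor_data(r), 0, 4*5);
	THFloatTensor_addmm(r, 1, r, 1, m1, m2);	/* every entry of r is now 1*0 + 3*(1*2) = 6 */
	THFloatTensor_free(m1);
	THFloatTensor_free(m2);
	THFloatTensor_free(r);
}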