int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale,
                      THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output)
{
    //Grab the input tensors
    float * data_flat = THFloatTensor_data(features);
    float * rois_flat = THFloatTensor_data(rois);
    float * output_flat = THFloatTensor_data(output);

    // Number of ROIs
    int num_rois = THFloatTensor_size(rois, 0);
    int size_rois = THFloatTensor_size(rois, 1);
    if (size_rois != 5)
    {
        return 0;
    }

    // data height
    int data_height = THFloatTensor_size(features, 2);
    // data width
    int data_width = THFloatTensor_size(features, 3);
    // Number of channels
    int num_channels = THFloatTensor_size(features, 1);

    // do ROIAlignForward
    ROIAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width,
                       num_channels, aligned_height, aligned_width, rois_flat, output_flat);

    return 1;
}
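/* A minimal usage sketch for roi_align_forward (not part of the original
   source): illustrative sizes, assuming the usual TH allocation helpers and
   a linked ROIAlignForwardCpu. Each ROI row is (batch_index, x1, y1, x2, y2)
   in input-image coordinates. */
static void roi_align_forward_example(void)
{
    THFloatTensor *features = THFloatTensor_newWithSize4d(1, 256, 32, 32); /* N x C x H x W feature map */
    THFloatTensor *rois = THFloatTensor_newWithSize2d(1, 5);               /* one ROI, 5 values per row */
    THFloatTensor *output = THFloatTensor_newWithSize4d(1, 256, 7, 7);     /* num_rois x C x aligned_h x aligned_w */
    float *r = THFloatTensor_data(rois);
    r[0] = 0;                 /* batch index */
    r[1] = 16; r[2] = 16;     /* x1, y1 */
    r[3] = 128; r[4] = 128;   /* x2, y2 */
    /* 7x7 pooled output at 1/16 spatial scale, as in a typical detection head */
    roi_align_forward(7, 7, 0.0625f, features, rois, output);
    THFloatTensor_free(features);
    THFloatTensor_free(rois);
    THFloatTensor_free(output);
}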
void THFloatTensor_convmm(THFloatTensor *r, float beta, float alpha, THFloatTensor *filt, THFloatTensor *m,
    int kH, int kW, int dH, int dW, int padH, int padW)
{
    struct sgemmargs args;

    args.transa = 0;
    args.transb = 0;
    args.m = r->size[1] * r->size[2];
    args.n = r->size[0];
    args.k = filt->size[1];
    args.alpha = alpha;
    args.beta = beta;
    args.lda = m->stride[0];
    args.ldb = filt->stride[0];
    args.ldc = r->stride[0];
    args.a = THFloatTensor_data(m);
    args.b = THFloatTensor_data(filt);
    args.c = THFloatTensor_data(r);
    args.ks0 = kH * kW;
    args.ks1 = kW;
    args.is0 = m->stride[0];
    args.is1 = m->stride[1];
    args.ih = m->size[1];
    args.os0 = r->stride[0];
    args.os1 = r->stride[1];
    args.dW = dW;
    args.dH = dH;
    args.padW = padW;
    args.padH = padH;
    sgemmargs(&args);
}
static int cuda_FloatTensor_fakecopy(lua_State *L)
{
    THFloatTensor *self = luaT_checkudata(L, 1, "torch.FloatTensor");
    THFloatTensor *src = luaT_checkudata(L, 2, "torch.FloatTensor");
    long *d_self_sz, *d_self_st, *d_src_sz, *d_src_st;
    long nElement = THFloatTensor_nElement(self);

    THArgCheck(THFloatTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match");

    THFloatTensor_computesz(self, &d_self_sz, &d_self_st);
    THFloatTensor_computesz(src, &d_src_sz, &d_src_st);

    THFloatTensor_kernel_copy(THFloatTensor_data(self), d_self_sz, d_self_st, self->nDimension,
                              THFloatTensor_data(src), d_src_sz, d_src_st, src->nDimension, nElement);

    THFree(d_self_sz);
    THFree(d_self_st);
    THFree(d_src_sz);
    THFree(d_src_st);

    lua_settop(L, 1);
    return 1;
}
int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale,
                       THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad)
{
    //Grab the input tensors
    float * top_grad_flat = THFloatTensor_data(top_grad);
    float * rois_flat = THFloatTensor_data(rois);
    float * bottom_grad_flat = THFloatTensor_data(bottom_grad);

    // Number of ROIs
    int num_rois = THFloatTensor_size(rois, 0);
    int size_rois = THFloatTensor_size(rois, 1);
    if (size_rois != 5)
    {
        return 0;
    }

    // batch size
    // int batch_size = THFloatTensor_size(bottom_grad, 0);
    // data height
    int data_height = THFloatTensor_size(bottom_grad, 2);
    // data width
    int data_width = THFloatTensor_size(bottom_grad, 3);
    // Number of channels
    int num_channels = THFloatTensor_size(bottom_grad, 1);

    // do ROIAlignBackward
    ROIAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height, data_width,
                        num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat);

    return 1;
}
THFloatTensor *cudnn_SpatialMaxPooling_updateOutput(struct module *module, THFloatTensor *input)
{
    int kW = module->SpatialMaxPooling.kW;
    int kH = module->SpatialMaxPooling.kH;
    int dW = module->SpatialMaxPooling.dW;
    int dH = module->SpatialMaxPooling.dH;
    int padW = module->SpatialMaxPooling.padW;
    int padH = module->SpatialMaxPooling.padH;
    THFloatTensor *output = module->output;
    cudnnTensorDescriptor_t dinput, doutput;
    cudnnPoolingDescriptor_t dpool;
    float one = 1, zero = 0;
    int sizes[4];

    errcheck(THcudnn_TensorDescriptor(&dinput, input));
    errcheck(cudnnCreatePoolingDescriptor(&dpool));
    errcheck(cudnnSetPooling2dDescriptor(dpool, CUDNN_POOLING_MAX, kH, kW, padH, padW, dH, dW));
    errcheck(cudnnGetPoolingNdForwardOutputDim(dpool, dinput, 4, sizes));
    THCudaTensor_resize4d(output, sizes[0], sizes[1], sizes[2], sizes[3]);
    errcheck(THcudnn_TensorDescriptor(&doutput, output));
    errcheck(cudnnPoolingForward(THcudnn_getHandle(), dpool, &one, dinput, THFloatTensor_data(input),
        &zero, doutput, THFloatTensor_data(output)));
    cudnnDestroyTensorDescriptor(dinput);
    cudnnDestroyTensorDescriptor(doutput);
    cudnnDestroyPoolingDescriptor(dpool);
    return output;
}
void THFloatTensor_mul(THFloatTensor *r_, THFloatTensor *t, float value)
{
    float *tp = THFloatTensor_data(t);
    float *rp = THFloatTensor_data(r_);
    long i;
    long sz = THFloatTensor_nElement(t);

#pragma omp parallel for if(sz > TH_OMP_OVERHEAD_THRESHOLD) private(i)
    for (i=0; i<sz; i++)
        rp[i] = tp[i] * value;
}
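/* Usage sketch for THFloatTensor_mul (illustrative, assuming the usual TH
   helpers): the destination r_ must already hold as many elements as t. */
static void mul_example(void)
{
    THFloatTensor *t = THFloatTensor_newWithSize1d(4);
    THFloatTensor *r = THFloatTensor_newWithSize1d(4);
    THFloatTensor_fill(t, 2.0f);    /* t = [2, 2, 2, 2] */
    THFloatTensor_mul(r, t, 3.0f);  /* r = [6, 6, 6, 6] */
    THFloatTensor_free(t);
    THFloatTensor_free(r);
}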
static int dist_smr(lua_State * L)
{
    // get args
    const void* torch_FloatTensor_id = luaT_checktypename2id(L, "torch.FloatTensor");
    THFloatTensor *output_ptr = luaT_checkudata(L, 1, torch_FloatTensor_id);
    THFloatTensor *input_ptr = luaT_checkudata(L, 2, torch_FloatTensor_id);
    THFloatTensor *kernel_ptr = luaT_checkudata(L, 3, torch_FloatTensor_id);
    float dynamic = lua_tonumber(L, 4);
    int begin_x = lua_tonumber(L, 5);
    int end_x = lua_tonumber(L, 6);
    int begin_y = lua_tonumber(L, 7);
    int end_y = lua_tonumber(L, 8);

    // get raw pointers
    float *output = THFloatTensor_data(output_ptr);
    float *input = THFloatTensor_data(input_ptr);
    float *kernel = THFloatTensor_data(kernel_ptr);

    // dims
    int kheight = kernel_ptr->size[0];
    int kwidth = kernel_ptr->size[1];

    // strides
    long *is = input_ptr->stride;
    long *ks = kernel_ptr->stride;
    long *os = output_ptr->stride;

    // similarity matching ratio (SMR)
    int i, j, x, y, pos;
    float probability;
    float distance;
    for(y = begin_y; y < end_y; y++) {
        for(x = begin_x; x < end_x; x++) {
            pos = y*is[0] + x*is[1];
            probability = 0;
            for(j = 0; j < kheight; j++) {
                for(i = 0; i < kwidth; i++) {
                    // fabsf, not abs: the integer abs() would truncate the float difference
                    distance = fabsf(input[pos + j*is[0] + i*is[1]] - kernel[j*ks[0] + i*ks[1]]);
                    if (distance < dynamic/2)
                        probability = probability + exp(-2*distance);
                }
            }
            output[y*os[0] + x*os[1]] = probability;
        }
    }
    // results are written in place into the output tensor; nothing is returned to Lua
    return 0;
}
// frame grabber
static int l_grabFrame (lua_State *L) {
    // Get Tensor's Info
    const int idx = lua_tonumber(L, 1);
    THFloatTensor * tensor = luaT_checkudata(L, 2, luaT_checktypename2id(L, "torch.FloatTensor"));

    // grab frame
    frame[idx] = cvQueryFrame( capture[idx] );
    if (!frame[idx]) {
        perror("could not query OpenCV capture");
    }

    // resize given tensor
    THFloatTensor_resize3d(tensor, 3, frame[idx]->height, frame[idx]->width);

    // copy to tensor
    int m0 = tensor->stride[1];
    int m1 = tensor->stride[2];
    int m2 = tensor->stride[0];
    unsigned char *src = frame[idx]->imageData;
    float *dst = THFloatTensor_data(tensor);
    int i, j, k;
    for (i=0; i < frame[idx]->height; i++) {
        for (j=0, k=0; j < frame[idx]->width; j++, k+=m1) {
            // red:
            dst[k] = src[i*frame[idx]->widthStep + j*frame[idx]->nChannels + 2]/255.;
            // green:
            dst[k+m2] = src[i*frame[idx]->widthStep + j*frame[idx]->nChannels + 1]/255.;
            // blue:
            dst[k+2*m2] = src[i*frame[idx]->widthStep + j*frame[idx]->nChannels + 0]/255.;
        }
        dst += m0;
    }
    return 0;
}
static void load_array_to_lua(lua_State *L, chtk::htkarray& arr){
    int ndims = 2;

    // based on code from mattorch with stride fix
    THLongStorage *size = THLongStorage_newWithSize(ndims);
    THLongStorage *stride = THLongStorage_newWithSize(ndims);
    // samplesize is in bytes, 4 bytes per float32 value; each row holds
    // the sample plus its left/right frame extensions
    THLongStorage_set(size, 0, arr.nsamples);
    THLongStorage_set(size, 1, arr.samplesize/4*(2*arr.frm_ext+1));
    THLongStorage_set(stride, 1, 1);
    THLongStorage_set(stride, 0, arr.samplesize/4*(2*arr.frm_ext+1));

    void * tensorDataPtr = NULL;
    size_t numBytes = 0;

    THFloatTensor *tensor = THFloatTensor_newWithSize(size, stride);
    tensorDataPtr = (void *)(THFloatTensor_data(tensor));
    numBytes = THFloatTensor_nElement(tensor) * 4;
    luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.FloatTensor"));

    // now copy the data
    assert(tensorDataPtr);
    memcpy(tensorDataPtr, (void *)(arr.data<void>()), numBytes);
}
// Copy extracted patches to CUDA memory and run the network.
// Keep in mind that GPU memory is limited and extracting too many patches
// at once may cause trouble. So if you need to extract a lot of patches,
// an efficient way is to divide the set into smaller equal parts and
// preallocate CPU and GPU memory.
void extractDescriptors(THCState *state,
                        cunn::Sequential::Ptr net,
                        const std::vector<cv::Mat>& patches,
                        cv::Mat& descriptors)
{
    size_t batch_size = 128;
    size_t N = patches.size();

    THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M);
    THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M);

    for(int j=0; j < ceil((float)N/batch_size); ++j)
    {
        float *data = THFloatTensor_data(buffer);
        size_t k = 0;
        for(size_t i = j*batch_size; i < std::min((j+1)*batch_size, N); ++i, ++k)
            memcpy(data + k*M*M, patches[i].data, sizeof(float) * M * M);

        // initialize 4D CUDA tensor and copy patches into it
        THCudaTensor_copyFloat(state, input, buffer);

        // propagate through the network
        THCudaTensor *output = net->forward(input);

        // copy descriptors back
        THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]);
        THFloatTensor_copyCuda(state, desc, output);

        size_t feature_dim = output->size[1];
        if(descriptors.cols != feature_dim || descriptors.rows != N)
            descriptors.create(N, feature_dim, CV_32F);

        memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float),
               THFloatTensor_data(desc),
               sizeof(float) * feature_dim * k);
        THFloatTensor_free(desc);
    }

    THCudaTensor_free(state, input);
    THFloatTensor_free(buffer);
}
void THFloatTensor_addmv(THFloatTensor *r_, float beta, THFloatTensor *t, float alpha, THFloatTensor *mat, THFloatTensor *vec)
{
    if( (mat->nDimension != 2) || (vec->nDimension != 1) )
        THError("matrix and vector expected, got %dD, %dD", mat->nDimension, vec->nDimension);

    if( mat->size[1] != vec->size[0] )
        THError("size mismatch, mat: %ld, vec: %ld", mat->size[1], vec->size[0]);

    if(t->nDimension != 1)
        THError("vector expected, got t: %dD", t->nDimension);

    if(t->size[0] != mat->size[0])
        THError("size mismatch, t: %ld, mat: %ld", t->size[0], mat->size[0]);

    if(r_ != t)
        THError("r_ != t not implemented");

    if(mat->stride[0] == 1)
    {
        THBlas_gemv('n', mat->size[0], mat->size[1], alpha, THFloatTensor_data(mat), mat->stride[1],
            THFloatTensor_data(vec), vec->stride[0], beta, THFloatTensor_data(r_), r_->stride[0]);
    }
    else if(mat->stride[1] == 1)
    {
        THBlas_gemv('t', mat->size[1], mat->size[0], alpha, THFloatTensor_data(mat), mat->stride[0],
            THFloatTensor_data(vec), vec->stride[0], beta, THFloatTensor_data(r_), r_->stride[0]);
    }
    else
        THError("addmv for non-contiguous not implemented");
}
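/* Sketch of a matrix-vector product via THFloatTensor_addmv (illustrative,
   assuming the usual TH helpers). Computes y = beta*y + alpha*M*x; note the
   implementation above requires r_ == t. */
static void addmv_example(void)
{
    THFloatTensor *M = THFloatTensor_newWithSize2d(3, 4);
    THFloatTensor *x = THFloatTensor_newWithSize1d(4);
    THFloatTensor *y = THFloatTensor_newWithSize1d(3);
    THFloatTensor_fill(M, 1.0f);
    THFloatTensor_fill(x, 2.0f);
    THFloatTensor_fill(y, 0.0f);
    /* y = 0*y + 1*M*x, so every entry becomes 4*1*2 = 8 */
    THFloatTensor_addmv(y, 0.0f, y, 1.0f, M, x);
    THFloatTensor_free(M);
    THFloatTensor_free(x);
    THFloatTensor_free(y);
}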
void THFloatTensor_addr(THFloatTensor *r_, float beta, THFloatTensor *t, float alpha, THFloatTensor *vec1, THFloatTensor *vec2)
{
    if( (vec1->nDimension != 1) || (vec2->nDimension != 1) )
        THError("vector and vector expected, got %dD, %dD tensors", vec1->nDimension, vec2->nDimension);

    if(t->nDimension != 2)
        THError("expected matrix, got %dD tensor for t", t->nDimension);

    if( (t->size[0] != vec1->size[0]) || (t->size[1] != vec2->size[0]) )
        THError("size mismatch, t: %ld, vec1: %ld, t: %ld, vec2: %ld", t->size[0], vec1->size[0], t->size[1], vec2->size[0]);

    if(r_ != t)
        THError("r_ != t not implemented");

    if(beta != 1)
        THFloatTensor_mul(r_, r_, beta);

    if(r_->stride[0] == 1)
    {
        THBlas_ger(vec1->size[0], vec2->size[0], alpha,
            THFloatTensor_data(vec1), vec1->stride[0],
            THFloatTensor_data(vec2), vec2->stride[0],
            THFloatTensor_data(r_), r_->stride[1]);
    }
    else if(r_->stride[1] == 1)
    {
        THBlas_ger(vec2->size[0], vec1->size[0], alpha,
            THFloatTensor_data(vec2), vec2->stride[0],
            THFloatTensor_data(vec1), vec1->stride[0],
            THFloatTensor_data(r_), r_->stride[0]);
    }
    else
        THError("addr for non-contiguous not implemented");
}
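/* Sketch of a rank-1 update via THFloatTensor_addr (illustrative, assuming
   the usual TH helpers): A = beta*A + alpha * vec1 (outer) vec2, again with
   r_ == t required by the implementation above. */
static void addr_example(void)
{
    THFloatTensor *A = THFloatTensor_newWithSize2d(3, 4);
    THFloatTensor *u = THFloatTensor_newWithSize1d(3);
    THFloatTensor *v = THFloatTensor_newWithSize1d(4);
    THFloatTensor_fill(A, 0.0f);
    THFloatTensor_fill(u, 1.0f);
    THFloatTensor_fill(v, 5.0f);
    /* A[i][j] = 1*A[i][j] + 2*u[i]*v[j] = 10 everywhere */
    THFloatTensor_addr(A, 1.0f, A, 2.0f, u, v);
    THFloatTensor_free(A);
    THFloatTensor_free(u);
    THFloatTensor_free(v);
}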
THFloatTensor *cudnn_Threshold_updateOutput(struct module *module, THFloatTensor *input)
{
    THFloatTensor *output = module->output;
    cudnnTensorDescriptor_t dinput, doutput;
    int inplace = module->Threshold.inplace;
    float one = 1, zero = 0;

    errcheck(THcudnn_TensorDescriptor(&dinput, input));
    if(inplace)
        THFloatTensor_set(output, input);
    else
        THCudaTensor_resize4d(output, input->size[0], input->size[1], input->size[2], input->size[3]);
    errcheck(THcudnn_TensorDescriptor(&doutput, output));
    errcheck(cudnnActivationForward(THcudnn_getHandle(), CUDNN_ACTIVATION_RELU, &one, dinput,
        THFloatTensor_data(input), &zero, doutput, THFloatTensor_data(output)));
    cudnnDestroyTensorDescriptor(dinput);
    cudnnDestroyTensorDescriptor(doutput);
    return output;
}
static int parse(lua_State *L)
{
    const char* id = luaT_typenameid(L, "torch.FloatTensor");           //Get float type id
    THFloatTensor *tensor = (THFloatTensor*) luaT_checkudata(L, 1, id); //Check if float
    float *input_data = THFloatTensor_data(tensor);                     //Pointer to tensor region
    float threshold = lua_tonumber(L, 2);                               //Threshold sent by lua
    int table_blobs = 3;
    int idx = lua_objlen(L, 3) + 1;
    float scale = lua_tonumber(L, 4);                                   //Which scale was this called for?

    // loop over pixels
    int x,y;
    for (y=0; y<tensor->size[0]; y++) {
        for (x=0; x<tensor->size[1]; x++) {
            float val = THFloatTensor_get2d(tensor, y, x);
            if (val > threshold) {
                // entry = {}
                lua_newtable(L);
                int entry = lua_gettop(L);

                // entry[1] = x
                lua_pushnumber(L, x);
                lua_rawseti(L, entry, 1);

                // entry[2] = y
                lua_pushnumber(L, y);
                lua_rawseti(L, entry, 2);

                // entry[3] = scale
                lua_pushnumber(L, scale);
                lua_rawseti(L, entry, 3);

                // blobs[idx] = entry; idx = idx + 1
                lua_rawseti(L, table_blobs, idx++);
            }
        }
    }
    return 1;
}
static void load_array_to_lua(lua_State *L, cnpy::NpyArray& arr){
    int ndims = arr.shape.size();

    // based on code from mattorch with stride fix
    int k;
    THLongStorage *size = THLongStorage_newWithSize(ndims);
    THLongStorage *stride = THLongStorage_newWithSize(ndims);
    for (k=0; k<ndims; k++) {
        THLongStorage_set(size, k, arr.shape[k]);
        if (k > 0)
            THLongStorage_set(stride, ndims-k-1, arr.shape[ndims-k]*THLongStorage_get(stride,ndims-k));
        else
            THLongStorage_set(stride, ndims-k-1, 1);
    }

    void * tensorDataPtr = NULL;
    size_t numBytes = 0;

    if ( arr.arrayType == 'f' ){ // float32/64
        if ( arr.word_size == 4 ){ //float32
            THFloatTensor *tensor = THFloatTensor_newWithSize(size, stride);
            tensorDataPtr = (void *)(THFloatTensor_data(tensor));
            numBytes = THFloatTensor_nElement(tensor) * arr.word_size;
            luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.FloatTensor"));
        }else if ( arr.word_size == 8){ //float64
            THDoubleTensor *tensor = THDoubleTensor_newWithSize(size, stride);
            tensorDataPtr = (void *)(THDoubleTensor_data(tensor));
            numBytes = THDoubleTensor_nElement(tensor) * arr.word_size;
            luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.DoubleTensor"));
        }
    }else if ( arr.arrayType == 'i' || arr.arrayType == 'u' ){
        // does torch have unsigned types .. need to look
        if ( arr.word_size == 1 ){ //int8
            THByteTensor *tensor = THByteTensor_newWithSize(size, stride);
            tensorDataPtr = (void *)(THByteTensor_data(tensor));
            numBytes = THByteTensor_nElement(tensor) * arr.word_size;
            luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.ByteTensor"));
        }else if ( arr.word_size == 2 ){ //int16
            THShortTensor *tensor = THShortTensor_newWithSize(size, stride);
            tensorDataPtr = (void *)(THShortTensor_data(tensor));
            numBytes = THShortTensor_nElement(tensor) * arr.word_size;
            luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.ShortTensor"));
        }else if ( arr.word_size == 4 ){ //int32
            THIntTensor *tensor = THIntTensor_newWithSize(size, stride);
            tensorDataPtr = (void *)(THIntTensor_data(tensor));
            numBytes = THIntTensor_nElement(tensor) * arr.word_size;
            luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.IntTensor"));
        }else if ( arr.word_size == 8){ //int64
            THLongTensor *tensor = THLongTensor_newWithSize(size, stride);
            tensorDataPtr = (void *)(THLongTensor_data(tensor));
            numBytes = THLongTensor_nElement(tensor) * arr.word_size;
            luaT_pushudata(L, tensor, luaT_checktypename2id(L, "torch.LongTensor"));
        }
    }else{
        printf("array type unsupported\n");
        throw std::runtime_error("unsupported data type");
    }

    // now copy the data
    assert(tensorDataPtr);
    memcpy(tensorDataPtr, (void *)(arr.data<void>()), numBytes);
}
JNIEXPORT float JNICALL
Java_com_torchandroid_facedemo_CameraClass_callTorch(JNIEnv *env, jobject thiz, jlong torchStateLocation,
                                                     jint width, jint height, jbyteArray NV21FrameData, jintArray outPixels)
{
    lua_State *L = (lua_State*) torchStateLocation;
    float netProfiler = 0;
    THFloatTensor *testTensor = THFloatTensor_newWithSize1d(1280*768); //Initialize 1D tensor
    jbyte *testTensor_data; //Holds the java byte data from the camera
    testTensor_data = (env)->GetByteArrayElements(NV21FrameData, 0); //Get pointer to java byte array region
    int imSize = 1280*768; //Number of pixels
    jfloat *poutPixels = THFloatTensor_data(testTensor); //Raw float pointer into the Torch tensor
    jint *output = env->GetIntArrayElements(outPixels, 0); //Get java int array region for output

    //This loop ignores U and V channels. Network doesn't use them.
    //Cam data comes like so - YYYYYY ... imSize times ... YYYYY UVUVUVUVUVUV.... <- ignore these
    for(int i = 0; i < imSize; i++)
    {
        output[i] = 0;
        poutPixels[i] = testTensor_data[i] & 0xFF;
    }

    int tableSize = 0; //Holds number of detections
    int *fill = NULL;  //Holds detections, allocated once their count is known

    lua_getglobal(L, "getDetections");
    lua_getglobal(L, "network");
    luaT_pushudata(L, testTensor, "torch.FloatTensor"); //Push tensor to lua stack
    lua_pushnumber(L, width);
    lua_pushnumber(L, height);

    if(lua_pcall(L, 4, 3, 0) != 0) //Call function. Print error if call not successful
        __android_log_print(ANDROID_LOG_INFO, "Torchandroid", "Error running function: %s", lua_tostring(L, -1));
    else
    {
        netProfiler = (float) lua_tonumber(L, -1);
        lua_pop(L, 1);
        tableSize = lua_tointeger(L, -1); //Get #detections from stack
        lua_pop(L, 1);
        if(tableSize != 0) //Extract x,y,w,h for each detection
        {
            fill = (int*) malloc(4*tableSize*sizeof(int));
            PrintTable(L, tableSize, fill);
        }
    }

    if(tableSize != 0) //Suppress boxes whose center lies inside an earlier box
    {
        int center[2] = {0};
        for(int i = 0; i < 4*tableSize; i+=4)
        {
            int x = fill[i];
            int y = fill[i+1];
            int w = fill[i+2];
            int h = fill[i+3];
            for(int j = i+4; j < 4*tableSize; j+=4)
            {
                center[0] = fill[j]+fill[j+2]*0.5; //x center
                center[1] = fill[j+1]+fill[j+3]*0.5; //y center
                if(((center[0] <= (x+w)) && (center[0] >= x)) && ((center[1] <= (y+h)) && (center[1] >= y)))
                {
                    fill[j+2] = 0; //Zero width marks the box as suppressed
                }
            }
        }
    }

    if(tableSize != 0) //Draw a red box around each surviving detection
    {
        int jlim = 0; //Hoisted to avoid recomputing the loop bound each iteration
        int tempnum1 = 0;
        for(int i = 0; i < 4*tableSize; i+=4)
        {
            if(fill[i+2] == 0)
                continue;
            int x = fill[i];
            int y = fill[i+1];
            int w = fill[i+2];
            int h = fill[i+3];
            __android_log_print(ANDROID_LOG_INFO, "Torchandroid", "x = %d y = %d w = %d h = %d", x, y, w, h);
            jlim = ((y-1)*1280+x+w);
            tempnum1 = 1280*(h-1);
            //Assign output pixels red color. 4 byte - ARGB. x,y from network in 2D. Convert to 1D.
            for(int j = (y-1)*1280+x; j < jlim; j++) //This loop does top and bottom lines of box
            {
                output[j-1280] = 0xFFFF0000;
                output[j+1280] = 0xFFFF0000;
                output[j] = 0xFFFF0000;
                output[j+tempnum1] = 0xFFFF0000;
                output[j+tempnum1-1280] = 0xFFFF0000;
                output[j+tempnum1+1280] = 0xFFFF0000;
            }
            jlim = (((y-1)*1280+x)+((1280*h)+w));
            for(int j = (y-1)*1280+x; j < jlim; j+=1280) //This loop does left and right of box
            {
                output[j+1] = 0xFFFF0000;
                output[j-1] = 0xFFFF0000;
                output[j] = 0xFFFF0000;
                output[j+w] = 0xFFFF0000;
                output[j+w+1] = 0xFFFF0000;
                output[j+w-1] = 0xFFFF0000;
            }
        }
        free(fill); //Allocated above once the detection count was known
    }

    env->ReleaseByteArrayElements(NV21FrameData, testTensor_data, 0); //Release the C-side pointer. Only need java now
    env->ReleaseIntArrayElements(outPixels, output, 0); //Same as above
    return netProfiler;
}
static void nn_unfolded_copy(THFloatTensor *finput, THFloatTensor *input,
    int kW, int kH, int dW, int dH, int padW, int padH,
    int nInputPlane, int inputWidth, int inputHeight, int outputWidth, int outputHeight)
{
    long k;
    float *input_data = THFloatTensor_data(input);
    float *finput_data = THFloatTensor_data(finput);

#pragma omp parallel for private(k)
    for(k = 0; k < nInputPlane*kH*kW; k++) {
        long nip = k / (kH*kW);
        long rest = k % (kH*kW);
        long kh = rest / kW;
        long kw = rest % kW;
        long x, y;
        long long ix, iy;
        float *dst = finput_data + nip*(kH*kW*outputHeight*outputWidth) + kh*(kW*outputHeight*outputWidth) + kw*(outputHeight*outputWidth);
        float *src = input_data + nip*(inputHeight*inputWidth);
        if (padW > 0 || padH > 0) {
            long lpad, rpad;
            for(y = 0; y < outputHeight; y++) {
                iy = (long long)(y*dH - padH + kh);
                if (iy < 0 || iy >= inputHeight) {
                    memset(dst+y*outputWidth, 0, sizeof(float)*outputWidth);
                } else {
                    if (dW==1){
                        ix = (long long)(0 - padW + kw);
                        lpad = fmaxf(0, padW-kw);
                        rpad = fmaxf(0, padW-(kW-kw-1));
                        if (outputWidth-rpad-lpad <= 0) {
                            memset(dst+(y*outputWidth), 0, sizeof(float)*outputWidth);
                        } else {
                            if (lpad > 0) memset(dst+y*outputWidth, 0, sizeof(float)*lpad);
                            memcpy(dst+(y*outputWidth+lpad), src+(iy*inputWidth+ix+lpad), sizeof(float)*(outputWidth-rpad-lpad));
                            if (rpad > 0) memset(dst+y*outputWidth + outputWidth - rpad, 0, sizeof(float)*rpad);
                        }
                    }
                    else{
                        for (x=0; x<outputWidth; x++){
                            ix = (long long)(x*dW - padW + kw);
                            if (ix < 0 || ix >= inputWidth)
                                memset(dst+(y*outputWidth+x), 0, sizeof(float)*1);
                            else
                                memcpy(dst+(y*outputWidth+x), src+(iy*inputWidth+ix), sizeof(float)*(1));
                        }
                    }
                }
            }
        } else {
            for(y = 0; y < outputHeight; y++) {
                iy = (long long)(y*dH + kh);
                ix = (long long)(0 + kw);
                if (dW == 1)
                    memcpy(dst+(y*outputWidth), src+(iy*inputWidth+ix), sizeof(float)*outputWidth);
                else{
                    for (x=0; x<outputWidth; x++)
                        memcpy(dst+(y*outputWidth+x), src+(iy*inputWidth+ix+x*dW), sizeof(float)*(1));
                }
            }
        }
    }
}
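/* A sketch of how nn_unfolded_copy (im2col) is typically driven (illustrative,
   assuming the usual TH helpers): the C x H x W input is unfolded into a
   (C*kH*kW) x (outH*outW) matrix so that convolution reduces to one GEMM
   against the reshaped weights. */
static void unfolded_copy_example(THFloatTensor *input /* C x H x W, contiguous */,
    int kW, int kH, int dW, int dH, int padW, int padH)
{
    int C = input->size[0], H = input->size[1], W = input->size[2];
    int outW = (W + 2*padW - kW) / dW + 1;
    int outH = (H + 2*padH - kH) / dH + 1;
    THFloatTensor *finput = THFloatTensor_newWithSize2d((long)C*kH*kW, (long)outH*outW);
    nn_unfolded_copy(finput, input, kW, kH, dW, dH, padW, padH, C, W, H, outW, outH);
    /* ... GEMM: output = weight (nOutputPlane x C*kH*kW) * finput ... */
    THFloatTensor_free(finput);
}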
void THFloatTensor_addmm(THFloatTensor *r_, float beta, THFloatTensor *t, float alpha, THFloatTensor *m1, THFloatTensor *m2)
{
    char transpose_r, transpose_m1, transpose_m2;
    THFloatTensor *r__, *m1_, *m2_;

    if( (m1->nDimension != 2) || (m2->nDimension != 2))
        THError("matrices expected, got %dD, %dD tensors", m1->nDimension, m2->nDimension);

    if(m1->size[1] != m2->size[0])
        THError("size mismatch, m1: %ld, m2: %ld", m1->size[1], m2->size[0]);

    if( t->nDimension != 2 )
        THError("matrix expected, got %dD tensor for t", t->nDimension);

    if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) )
        THError("size mismatch, t: %ld, m1: %ld, t: %ld, m2: %ld", t->size[0], m1->size[0], t->size[1], m2->size[1]);

    if(t != r_)
        THError("Not implemented: t != r");

    /* printf("%ldx%ld = %ldx%ld X %ldx%ld\n", r_->size[0], r_->size[1], m1->size[0], m1->size[1], m2->size[0], m2->size[1]); */

    /* r_ */
    if(r_->stride[0] == 1 && r_->stride[1] != 0)
    {
        transpose_r = 'n';
        r__ = r_;
    }
    else if(r_->stride[1] == 1 && r_->stride[0] != 0)
    {
        THFloatTensor *swap = m2;
        m2 = m1;
        m1 = swap;
        transpose_r = 't';
        r__ = r_;
    }
    else
    {
        THError("Transpose not implemented (1)");
        return;
        /* transpose_r = 'n';
        r__ = THFloatTensor_newWithSize2d(r_->size[1], r_->size[0]);
        THFloatTensor_copy(r__, r_);
        THFloatTensor_transpose(r__, NULL, 0, 1); */
    }

    /* m1 */
    if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
    {
        transpose_m1 = 'n';
        m1_ = m1;
    }
    else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
    {
        transpose_m1 = 't';
        m1_ = m1;
    }
    else
    {
        THError("Transpose not implemented (2)");
        return;
        /* transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
        m1_ = THFloatTensor_newContiguous(m1); */
    }

    /* m2 */
    if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
    {
        transpose_m2 = 'n';
        m2_ = m2;
    }
    else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
    {
        transpose_m2 = 't';
        m2_ = m2;
    }
    else
    {
        THError("Transpose not implemented (3)");
        return;
        /* transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
        m2_ = THFloatTensor_(newContiguous)(m2); */
    }

    /* do the operation */
    THBlas_gemm(transpose_m1, transpose_m2,
        r__->size[(transpose_r == 'n' ? 0 : 1)],
        r__->size[(transpose_r == 'n' ? 1 : 0)],
        m1_->size[(transpose_r == 'n' ? 1 : 0)],
        alpha,
        THFloatTensor_data(m1_),
        (transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
        THFloatTensor_data(m2_),
        (transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
        beta,
        THFloatTensor_data(r__),
        r__->stride[(transpose_r == 'n' ? 1 : 0)]);

    /* free intermediate variables */
    if(m1_ != m1)
        THFloatTensor_free(m1_);
    if(m2_ != m2)
        THFloatTensor_free(m2_);
    if(r__ != r_)
        THError("freeCopyTo not implemented");
        /* THFloatTensor_(freeCopyTo)(r__, r_); */
}
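/* Sketch of a matrix-matrix product via THFloatTensor_addmm (illustrative,
   assuming the usual TH helpers): r = beta*t + alpha*m1*m2, with t == r_ as
   required by the implementation above. */
static void addmm_example(void)
{
    THFloatTensor *A = THFloatTensor_newWithSize2d(2, 3);
    THFloatTensor *B = THFloatTensor_newWithSize2d(3, 4);
    THFloatTensor *Cm = THFloatTensor_newWithSize2d(2, 4);
    THFloatTensor_fill(A, 1.0f);
    THFloatTensor_fill(B, 2.0f);
    THFloatTensor_fill(Cm, 0.0f);
    /* Cm = 0*Cm + 1*A*B, so every entry becomes 3*1*2 = 6 */
    THFloatTensor_addmm(Cm, 0.0f, Cm, 1.0f, A, B);
    THFloatTensor_free(A);
    THFloatTensor_free(B);
    THFloatTensor_free(Cm);
}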
int BilinearSamplerBHWD_updateOutput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *output)
{
    int batchsize = THFloatTensor_size(inputImages, 0);
    int inputImages_height = THFloatTensor_size(inputImages, 1);
    int inputImages_width = THFloatTensor_size(inputImages, 2);
    int output_height = THFloatTensor_size(output, 1);
    int output_width = THFloatTensor_size(output, 2);
    int inputImages_channels = THFloatTensor_size(inputImages, 3);

    int output_strideBatch = THFloatTensor_stride(output, 0);
    int output_strideHeight = THFloatTensor_stride(output, 1);
    int output_strideWidth = THFloatTensor_stride(output, 2);

    int inputImages_strideBatch = THFloatTensor_stride(inputImages, 0);
    int inputImages_strideHeight = THFloatTensor_stride(inputImages, 1);
    int inputImages_strideWidth = THFloatTensor_stride(inputImages, 2);

    int grids_strideBatch = THFloatTensor_stride(grids, 0);
    int grids_strideHeight = THFloatTensor_stride(grids, 1);
    int grids_strideWidth = THFloatTensor_stride(grids, 2);

    real *inputImages_data, *output_data, *grids_data;
    inputImages_data = THFloatTensor_data(inputImages);
    output_data = THFloatTensor_data(output);
    grids_data = THFloatTensor_data(grids);

    int b, yOut, xOut;

    for(b=0; b < batchsize; b++)
    {
        for(yOut=0; yOut < output_height; yOut++)
        {
            for(xOut=0; xOut < output_width; xOut++)
            {
                //read the grid
                real yf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth];
                real xf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + 1];

                // get the weights for interpolation
                int yInTopLeft, xInTopLeft;
                real yWeightTopLeft, xWeightTopLeft;

                real xcoord = (xf + 1) * (inputImages_width - 1) / 2;
                xInTopLeft = floor(xcoord);
                xWeightTopLeft = 1 - (xcoord - xInTopLeft);

                real ycoord = (yf + 1) * (inputImages_height - 1) / 2;
                yInTopLeft = floor(ycoord);
                yWeightTopLeft = 1 - (ycoord - yInTopLeft);

                const int outAddress = output_strideBatch * b + output_strideHeight * yOut + output_strideWidth * xOut;
                const int inTopLeftAddress = inputImages_strideBatch * b + inputImages_strideHeight * yInTopLeft + inputImages_strideWidth * xInTopLeft;
                const int inTopRightAddress = inTopLeftAddress + inputImages_strideWidth;
                const int inBottomLeftAddress = inTopLeftAddress + inputImages_strideHeight;
                const int inBottomRightAddress = inBottomLeftAddress + inputImages_strideWidth;

                real v=0;
                real inTopLeft=0;
                real inTopRight=0;
                real inBottomLeft=0;
                real inBottomRight=0;

                // we are careful with the boundaries
                bool topLeftIsIn = xInTopLeft >= 0 && xInTopLeft <= inputImages_width-1
                    && yInTopLeft >= 0 && yInTopLeft <= inputImages_height-1;
                bool topRightIsIn = xInTopLeft+1 >= 0 && xInTopLeft+1 <= inputImages_width-1
                    && yInTopLeft >= 0 && yInTopLeft <= inputImages_height-1;
                bool bottomLeftIsIn = xInTopLeft >= 0 && xInTopLeft <= inputImages_width-1
                    && yInTopLeft+1 >= 0 && yInTopLeft+1 <= inputImages_height-1;
                bool bottomRightIsIn = xInTopLeft+1 >= 0 && xInTopLeft+1 <= inputImages_width-1
                    && yInTopLeft+1 >= 0 && yInTopLeft+1 <= inputImages_height-1;

                int t;
                // interpolation happens here
                for(t=0; t<inputImages_channels; t++)
                {
                    if(topLeftIsIn) inTopLeft = inputImages_data[inTopLeftAddress + t];
                    if(topRightIsIn) inTopRight = inputImages_data[inTopRightAddress + t];
                    if(bottomLeftIsIn) inBottomLeft = inputImages_data[inBottomLeftAddress + t];
                    if(bottomRightIsIn) inBottomRight = inputImages_data[inBottomRightAddress + t];

                    v = xWeightTopLeft * yWeightTopLeft * inTopLeft
                        + (1 - xWeightTopLeft) * yWeightTopLeft * inTopRight
                        + xWeightTopLeft * (1 - yWeightTopLeft) * inBottomLeft
                        + (1 - xWeightTopLeft) * (1 - yWeightTopLeft) * inBottomRight;

                    output_data[outAddress + t] = v;
                }
            }
        }
    }

    return 1;
}
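/* Usage sketch for BilinearSamplerBHWD_updateOutput (illustrative, assuming
   the usual TH helpers and that `real` is float here): an identity grid in
   normalized [-1,1] coordinates makes the sampler reproduce its input.
   Layout is B x H x W x D with the grid storing (y, x) per output location. */
static void bhwd_identity_example(THFloatTensor *inputImages /* B x H x W x C */)
{
    int B = THFloatTensor_size(inputImages, 0);
    int H = THFloatTensor_size(inputImages, 1);
    int W = THFloatTensor_size(inputImages, 2);
    int C = THFloatTensor_size(inputImages, 3);
    THFloatTensor *grids = THFloatTensor_newWithSize4d(B, H, W, 2);
    THFloatTensor *output = THFloatTensor_newWithSize4d(B, H, W, C);
    float *g = THFloatTensor_data(grids);
    int b, y, x;
    for(b = 0; b < B; b++)
        for(y = 0; y < H; y++)
            for(x = 0; x < W; x++)
            {
                long o = (((long)b*H + y)*W + x) * 2;
                g[o]   = H > 1 ? 2.0f*y/(H-1) - 1 : 0; /* yf: maps back to row y */
                g[o+1] = W > 1 ? 2.0f*x/(W-1) - 1 : 0; /* xf: maps back to column x */
            }
    BilinearSamplerBHWD_updateOutput(inputImages, grids, output);
    /* output now equals inputImages up to floating-point rounding */
    THFloatTensor_free(grids);
    THFloatTensor_free(output);
}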
int BilinearSamplerBCHW_updateGradInput(THFloatTensor *inputImages, THFloatTensor *grids, THFloatTensor *gradInputImages,
    THFloatTensor *gradGrids, THFloatTensor *gradOutput)
{
    bool onlyGrid = false;

    int batchsize = THFloatTensor_size(inputImages, 0);
    int inputImages_height = THFloatTensor_size(inputImages, 2);
    int inputImages_width = THFloatTensor_size(inputImages, 3);
    int gradOutput_height = THFloatTensor_size(gradOutput, 2);
    int gradOutput_width = THFloatTensor_size(gradOutput, 3);
    int inputImages_channels = THFloatTensor_size(inputImages, 1);

    int gradOutput_strideBatch = THFloatTensor_stride(gradOutput, 0);
    int gradOutput_strideHeight = THFloatTensor_stride(gradOutput, 2);
    int gradOutput_strideWidth = THFloatTensor_stride(gradOutput, 3);
    int gradOutput_strideChannel = THFloatTensor_stride(gradOutput, 1);

    int inputImages_strideBatch = THFloatTensor_stride(inputImages, 0);
    int inputImages_strideHeight = THFloatTensor_stride(inputImages, 2);
    int inputImages_strideWidth = THFloatTensor_stride(inputImages, 3);
    int inputImages_strideChannel = THFloatTensor_stride(inputImages, 1);

    int gradInputImages_strideBatch = THFloatTensor_stride(gradInputImages, 0);
    int gradInputImages_strideHeight = THFloatTensor_stride(gradInputImages, 2);
    int gradInputImages_strideWidth = THFloatTensor_stride(gradInputImages, 3);
    int gradInputImages_strideChannel = THFloatTensor_stride(gradInputImages, 1);

    int grids_strideBatch = THFloatTensor_stride(grids, 0);
    int grids_strideHeight = THFloatTensor_stride(grids, 2);
    int grids_strideWidth = THFloatTensor_stride(grids, 3);
    int grids_strideChannel = THFloatTensor_stride(grids, 1);

    int gradGrids_strideBatch = THFloatTensor_stride(gradGrids, 0);
    int gradGrids_strideHeight = THFloatTensor_stride(gradGrids, 2);
    int gradGrids_strideWidth = THFloatTensor_stride(gradGrids, 3);
    int gradGrids_strideChannel = THFloatTensor_stride(gradGrids, 1);

    real *inputImages_data, *gradOutput_data, *grids_data, *gradGrids_data, *gradInputImages_data;
    inputImages_data = THFloatTensor_data(inputImages);
    gradOutput_data = THFloatTensor_data(gradOutput);
    grids_data = THFloatTensor_data(grids);
    gradGrids_data = THFloatTensor_data(gradGrids);
    gradInputImages_data = THFloatTensor_data(gradInputImages);

    int b, yOut, xOut;

    for(b=0; b < batchsize; b++)
    {
        for(yOut=0; yOut < gradOutput_height; yOut++)
        {
            for(xOut=0; xOut < gradOutput_width; xOut++)
            {
                //read the grid
                real xf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + grids_strideChannel];
                real yf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth];

                // get the weights for interpolation
                int yInTopLeft, xInTopLeft;
                real yWeightTopLeft, xWeightTopLeft;

                real xcoord = (xf + 1) * (inputImages_width - 1) / 2;
                xInTopLeft = floor(xcoord);
                xWeightTopLeft = 1 - (xcoord - xInTopLeft);

                real ycoord = (yf + 1) * (inputImages_height - 1) / 2;
                yInTopLeft = floor(ycoord);
                yWeightTopLeft = 1 - (ycoord - yInTopLeft);

                const int inTopLeftAddress = inputImages_strideBatch * b + inputImages_strideHeight * yInTopLeft + inputImages_strideWidth * xInTopLeft;
                const int inTopRightAddress = inTopLeftAddress + inputImages_strideWidth;
                const int inBottomLeftAddress = inTopLeftAddress + inputImages_strideHeight;
                const int inBottomRightAddress = inBottomLeftAddress + inputImages_strideWidth;

                const int gradInputImagesTopLeftAddress = gradInputImages_strideBatch * b + gradInputImages_strideHeight * yInTopLeft + gradInputImages_strideWidth * xInTopLeft;
                const int gradInputImagesTopRightAddress = gradInputImagesTopLeftAddress + gradInputImages_strideWidth;
                const int gradInputImagesBottomLeftAddress = gradInputImagesTopLeftAddress + gradInputImages_strideHeight;
                const int gradInputImagesBottomRightAddress = gradInputImagesBottomLeftAddress + gradInputImages_strideWidth;

                const int gradOutputAddress = gradOutput_strideBatch * b + gradOutput_strideHeight * yOut + gradOutput_strideWidth * xOut;

                real topLeftDotProduct = 0;
                real topRightDotProduct = 0;
                real bottomLeftDotProduct = 0;
                real bottomRightDotProduct = 0;

                // we are careful with the boundaries
                bool topLeftIsIn = xInTopLeft >= 0 && xInTopLeft <= inputImages_width-1
                    && yInTopLeft >= 0 && yInTopLeft <= inputImages_height-1;
                bool topRightIsIn = xInTopLeft+1 >= 0 && xInTopLeft+1 <= inputImages_width-1
                    && yInTopLeft >= 0 && yInTopLeft <= inputImages_height-1;
                bool bottomLeftIsIn = xInTopLeft >= 0 && xInTopLeft <= inputImages_width-1
                    && yInTopLeft+1 >= 0 && yInTopLeft+1 <= inputImages_height-1;
                bool bottomRightIsIn = xInTopLeft+1 >= 0 && xInTopLeft+1 <= inputImages_width-1
                    && yInTopLeft+1 >= 0 && yInTopLeft+1 <= inputImages_height-1;

                int t;

                for(t=0; t<inputImages_channels; t++)
                {
                    real gradOutValue = gradOutput_data[gradOutputAddress + t * gradOutput_strideChannel];
                    if(topLeftIsIn)
                    {
                        real inTopLeft = inputImages_data[inTopLeftAddress + t * inputImages_strideChannel];
                        topLeftDotProduct += inTopLeft * gradOutValue;
                        if(!onlyGrid) gradInputImages_data[gradInputImagesTopLeftAddress + t * gradInputImages_strideChannel] += xWeightTopLeft * yWeightTopLeft * gradOutValue;
                    }

                    if(topRightIsIn)
                    {
                        real inTopRight = inputImages_data[inTopRightAddress + t * inputImages_strideChannel];
                        topRightDotProduct += inTopRight * gradOutValue;
                        if(!onlyGrid) gradInputImages_data[gradInputImagesTopRightAddress + t * gradInputImages_strideChannel] += (1 - xWeightTopLeft) * yWeightTopLeft * gradOutValue;
                    }

                    if(bottomLeftIsIn)
                    {
                        real inBottomLeft = inputImages_data[inBottomLeftAddress + t * inputImages_strideChannel];
                        bottomLeftDotProduct += inBottomLeft * gradOutValue;
                        if(!onlyGrid) gradInputImages_data[gradInputImagesBottomLeftAddress + t * gradInputImages_strideChannel] += xWeightTopLeft * (1 - yWeightTopLeft) * gradOutValue;
                    }

                    if(bottomRightIsIn)
                    {
                        real inBottomRight = inputImages_data[inBottomRightAddress + t * inputImages_strideChannel];
                        bottomRightDotProduct += inBottomRight * gradOutValue;
                        if(!onlyGrid) gradInputImages_data[gradInputImagesBottomRightAddress + t * gradInputImages_strideChannel] += (1 - xWeightTopLeft) * (1 - yWeightTopLeft) * gradOutValue;
                    }
                }

                xf = - yWeightTopLeft * topLeftDotProduct + yWeightTopLeft * topRightDotProduct
                     - (1-yWeightTopLeft) * bottomLeftDotProduct + (1-yWeightTopLeft) * bottomRightDotProduct;
                yf = - xWeightTopLeft * topLeftDotProduct + xWeightTopLeft * bottomLeftDotProduct
                     - (1-xWeightTopLeft) * topRightDotProduct + (1-xWeightTopLeft) * bottomRightDotProduct;

                gradGrids_data[b*gradGrids_strideBatch + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + gradGrids_strideChannel] = xf * (inputImages_width-1) / 2;
                gradGrids_data[b*gradGrids_strideBatch + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth] = yf * (inputImages_height-1) / 2;
            }
        }
    }

    return 1;
}
static int luafunc_load(lua_State *L)
{
    THFloatTensor *t = 0;
    const char *tname = luaT_typename(L, 1);
    int i, index = lua_tointeger(L, 2);

    if(max == 0)
        luaL_error(L, "fastimage.init: call init first");
    if(index > nsizes)
        luaL_error(L, "Invalid size index %d", index);
    index--;
    if(index < 0)
        index = 0;
    if(tname && !strcmp(tname, "torch.FloatTensor"))
    {
        t = luaT_toudata(L, 1, luaT_typenameid(L, "torch.FloatTensor"));
        if(t->nDimension == 4 && t->size[1] == 3)
        {
            if(nsizes == 1)
            {
                sizes[0].width = t->size[3];
                sizes[0].height = t->size[2];
                max = t->size[0];
            } else if(sizes[0].width != t->size[3] || sizes[0].height != t->size[2] || max != t->size[0])
                t = 0;
        } else t = 0;
    }
    if(!index)
    {
        for(i = 0; i < max; i++)
            if(images[i].bitmap)
            {
                free(images[i].bitmap);
                images[i].bitmap = 0;
            }
        for(i = 0; i < max; i++)
        {
            if(loadnextimage(images + i))
                break;
        }
        if(i == 0)
        {
            lprintf("Nothing found\n");
            return 0;
        }
        if(i < max)
        {
            max = i;
            if(t)
                t = THFloatTensor_newNarrow(t, 0, 0, i);
        }
    }
    for(i = 0; i < max; i++)
    {
        if(nsizes == 1 && (!sizes[0].width || !sizes[0].height))
        {
            lprintf("Set width = %d, height = %d\n", images[i].width, images[i].height);
            sizes[0].width = images[i].width;
            sizes[0].height = images[i].height;
        }
        if(!t)
            t = THFloatTensor_newWithSize4d(max, 3, sizes[index].height, sizes[index].width);
        uint8_t *rescaled = scale(images + i, sizes[index].width, sizes[index].height);
        rgb_tofloat(THFloatTensor_data(t) + i * t->stride[0], t->stride[1], t->stride[2],
            rescaled, sizes[index].width, sizes[index].height);
        if(rescaled != images[i].bitmap)
            free(rescaled);
        if(nsizes == 1 && images[i].bitmap)
        {
            // It's not necessary to keep all the images in memory, if there is only one size
            free(images[i].bitmap);
            images[i].bitmap = 0;
        }
    }
    lprintf("%d x 3 x %d x %d tensor returned\n", i, sizes[index].height, sizes[index].width);
    luaT_pushudata(L, t, "torch.FloatTensor");
    lua_createtable(L, max, 0);
    for(i = 0; i < max; i++)
    {
        lua_pushinteger(L, i+1);
        lua_createtable(L, 0, 3);
        lua_pushstring(L, "filename");
        lua_pushstring(L, images[i].filename);
        lua_settable(L, -3);
        lua_pushstring(L, "width");
        lua_pushinteger(L, images[i].width);
        lua_settable(L, -3);
        lua_pushstring(L, "height");
        lua_pushinteger(L, images[i].height);
        lua_settable(L, -3);
        lua_settable(L, -3);
    }
    return 2;
}
static int luafunc_init(lua_State *L)
{
    struct stat st;
    const char *path = lua_tostring(L, 1);
    max = lua_tointeger(L, 2);
    if(!path)
        luaL_error(L, "fastimage.init: path has to be a string");
    if(max < 1)
        luaL_error(L, "fastimage.init: max has to be a positive number");
    strcpy(initpath, path);
    const char *tname = luaT_typename(L, 3);
    if(images)
    {
        int i;
        for(i = 0; i < max; i++)
            if(images[i].bitmap)
                free(images[i].bitmap);
        free(images);
        images = 0;
    }
    if(sizes)
    {
        free(sizes);
        sizes = 0;
    }
    nsizes = 0;
    if(tname && !strcmp(tname, "torch.FloatTensor"))
    {
        THFloatTensor *t = luaT_toudata(L, 3, luaT_typenameid(L, "torch.FloatTensor"));
        if(t->nDimension == 2 && t->size[1] == 2)
        {
            int i;
            nsizes = t->size[0];
            sizes = (imgsize_t *)malloc(nsizes * sizeof(imgsize_t));
            float *data = THFloatTensor_data(t);
            for(i = 0; i < nsizes; i++)
            {
                sizes[i].width = data[i * t->stride[0]];
                sizes[i].height = data[i * t->stride[0] + 1];
            }
            if(lua_isnumber(L, 4))
                greylevel = (int)(255 * lua_tonumber(L, 4));
            else greylevel = -1;
        } else t = 0;
    } else {
        nsizes = 1;
        sizes = (imgsize_t *)malloc(sizeof(imgsize_t));
        sizes[0].width = lua_tointeger(L, 3);
        sizes[0].height = lua_tointeger(L, 4);
        if(lua_isnumber(L, 5))
            greylevel = (int)(255 * lua_tonumber(L, 5));
        else greylevel = -1;
    }
    images = (img_t *)calloc(max, sizeof(img_t));
    lprintf("fastimage.init(%s, %d, %d, %d, %d)\n", path, max, sizes[0].width, sizes[0].height, greylevel);
    terminate = 0;
    if(dir)
    {
        closedir(dir);
        dir = 0;
    }
    if(!stat(path, &st))
    {
        if(S_ISREG(st.st_mode))
            return 0;
        else if(S_ISDIR(st.st_mode))
        {
            lprintf("opendir %s\n", path);
            dir = opendir(path);
            if(!dir)
                luaL_error(L, "fastimage.init: failed to open directory %s", path);
            return 0;
        } else luaL_error(L, "fastimage.init: %s is neither a file, nor a directory", path);
    } else luaL_error(L, "fastimage.init: Cannot stat %s", path);
    return 0;
}
THFloatTensor *nn_SpatialConvolution_updateOutput(struct module *module, THFloatTensor *input)
{
    int dW = module->SpatialConvolution.dW;
    int dH = module->SpatialConvolution.dH;
    THFloatTensor *weight = module->SpatialConvolution.weight;
    THFloatTensor *bias = module->SpatialConvolution.bias;
    THFloatTensor *output = module->output;

    int dimw = 2;
    int dimh = 1;

    if (input->nDimension == 4)
    {
        dimw++;
        dimh++;
    }

    long nOutputPlane = weight->size[0];
    long kW = weight->size[3];
    long kH = weight->size[2];
    long inputWidth = input->size[dimw];
    long inputHeight = input->size[dimh];
    long outputWidth = (inputWidth - kW) / dW + 1;
    long outputHeight = (inputHeight - kH) / dH + 1;

    if (input->nDimension == 3)
    {
        long i;
        float *bias_data;
        float *output_data;

        THFloatTensor_resize3d(output, nOutputPlane, outputHeight, outputWidth);
        /* add bias */
        bias_data = THFloatTensor_data(bias);
        output_data = THFloatTensor_data(output);

#pragma omp parallel for private(i)
        for (i=0; i<bias->size[0]; i++)
        {
            float *ptr_output = output_data + i*outputWidth*outputHeight;
            long j;
            for(j = 0; j < outputWidth*outputHeight; j++)
                ptr_output[j] = bias_data[i];
        }
        THFloatTensor_conv2Dmv(output, 1.0, 1.0, input, weight, dH, dW, "V", "X");
    }
    else
    {
        float *bias_data;
        float *output_data;
        long p;

        THFloatTensor_resize4d(output, input->size[0], nOutputPlane, outputHeight, outputWidth);
        bias_data = THFloatTensor_data(bias);
        output_data = THFloatTensor_data(output);

#pragma omp parallel for private(p)
        for (p=0; p<input->size[0]; p++)
        {
            /* BIAS */
            long i;
            for (i=0; i<bias->size[0]; i++)
            {
                float *ptr_output = output_data + p*nOutputPlane*outputWidth*outputHeight + i*outputWidth*outputHeight;
                long j;
                for(j = 0; j < outputWidth*outputHeight; j++)
                    ptr_output[j] = bias_data[i];
            }
        }
        /* do convolutions */
        THFloatTensor_conv2Dmm(output, 1.0, 1.0, input, weight, dH, dW, "V", "X");
    }

    return output;
}
void THFloatTensor_conv2Dmm(THFloatTensor *r_, float beta, float alpha, THFloatTensor *t_, THFloatTensor *k_,
    long srow, long scol, const char *vf, const char *xc)
{
    long nInputPlane, nInputRows, nInputCols;
    long nKernelRows, nKernelCols;
    long nOutputPlane, nOutputRows, nOutputCols;
    long kstride0, kstride1;
    THFloatTensor *input;
    THFloatTensor* kernel;
    long nbatch;
    long nelem;
    float *input_data;
    float *weight_data;
    float *output_data;
    long p;

    if(t_->nDimension != 4)
        THError("input: 4D Tensor expected");
    if(k_->nDimension != 4)
        THError("kernel: 4D Tensor expected");
    if(srow < 1)
        THError("Stride should be a positive integer");
    if(scol < 1)
        THError("Stride should be a positive integer");
    if(*vf != 'V' || *xc != 'X')
        THError("Type of convolution can be 'V','X' only");

    input = t_;
    kernel = k_;

    nbatch = input->size[0];
    nInputPlane = input->size[1];
    nInputRows = input->size[2];
    nInputCols = input->size[3];

    kstride0 = kernel->stride[0];
    kstride1 = kernel->stride[1];
    nKernelRows = kernel->size[2];
    nKernelCols = kernel->size[3];
    nOutputPlane = kernel->size[0];
    if(kernel->size[1] != nInputPlane)
        THError("invalid number of input planes");
    if(!(nInputRows >= nKernelRows && nInputCols >= nKernelCols))
        THError("conv2Dmm : Input image is smaller than kernel");

    nOutputRows = (nInputRows - nKernelRows) / srow + 1;
    nOutputCols = (nInputCols - nKernelCols) / scol + 1;

    nelem = THFloatTensor_nElement(r_);
    THFloatTensor_resize4d(r_, nbatch, nOutputPlane, nOutputRows, nOutputCols);

    input_data = THFloatTensor_data(input);
    weight_data = THFloatTensor_data(kernel);
    output_data = THFloatTensor_data(r_);

    if (nelem == 0 || beta == 0 || nelem != THFloatTensor_nElement(r_))
    {
        /*THFloatTensor_(zero)(r_);*/
#pragma omp parallel for private(p)
        for (p=0; p < r_->size[0]; p++)
        {
            long k;
            for (k = 0; k < r_->size[1]; k++)
            {
                float* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
                long l;
                for (l = 0; l < nOutputRows*nOutputCols; l++)
                    ptr_output[l] = 0.0;
            }
        }
    }
    else if (beta != 1)
    {
        /*THFloatTensor_(mul)(r_, beta);*/
#pragma omp parallel for private(p)
        for(p=0; p < r_->size[0]; p++)
        {
            long k;
            for (k = 0; k < r_->size[1]; k++)
            {
                float* ptr_output = output_data + p*nOutputPlane*nOutputRows*nOutputCols + k*nOutputCols*nOutputRows;
                long l;
                for (l = 0; l < nOutputRows*nOutputCols; l++)
                    ptr_output[l] *= beta;
            }
        }
    }

#pragma omp parallel for private(p)
    for(p=0; p < nbatch; p++)
    {
        long k;
        for(k = 0; k < nOutputPlane; k++)
        {
            long i;
            /* get output */
            float *ptr_output = output_data + p*nOutputPlane*nOutputCols*nOutputRows + k*nOutputCols*nOutputRows;
            for(i = 0; i < nInputPlane; i++)
            {
                /* get kernel */
                float *ptr_weight = weight_data + k*kstride0 + i*kstride1;
                /* get input */
                float *ptr_input = input_data + p*nInputPlane*nInputRows*nInputCols + i*nInputRows*nInputCols;

                /* do image, kernel convolution */
                THFloatTensor_validXCorr2Dptr(ptr_output, alpha, ptr_input, nInputRows, nInputCols,
                    ptr_weight, nKernelRows, nKernelCols, srow, scol);
            }
        }
    }
}
THFloatTensor *cudnn_SpatialConvolution_updateOutput(struct module *module, THFloatTensor *input)
{
    int kW = module->SpatialConvolution.kW;
    int kH = module->SpatialConvolution.kH;
    int dW = module->SpatialConvolution.dW;
    int dH = module->SpatialConvolution.dH;
    int padW = module->SpatialConvolution.padW;
    int padH = module->SpatialConvolution.padH;
    int nInputPlane = module->SpatialConvolution.nInputPlane;
    int nOutputPlane = module->SpatialConvolution.nOutputPlane;
    THFloatTensor *weight = module->SpatialConvolution.weight;
    THFloatTensor *bias = module->SpatialConvolution.bias;
    THFloatTensor *output = module->output;
    int sizes[4];
    int pad[2], filterStride[2], upscale[2];
    cudnnTensorDescriptor_t dinput, dbias, doutput;
    cudnnConvolutionDescriptor_t dconv;
    cudnnFilterDescriptor_t dweight;
    float one = 1, zero = 0;
    size_t reqwssize;
    static void *ws;
    static size_t wssize;
    static const int alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

    pad[0] = padH;
    pad[1] = padW;
    filterStride[0] = dH;
    filterStride[1] = dW;
    upscale[0] = 1;
    upscale[1] = 1;

    if(input->nDimension <= 2)
    {
        // Here we use the SpatialConvolution module to perform a linear transformation
        errcheck(cudnnCreateTensorDescriptor(&dinput));
        if(input->nDimension == 1)
            errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, 1, input->size[0], 1, 1));
        else
            errcheck(cudnnSetTensor4dDescriptor(dinput, CUDNN_TENSOR_NCHW, floattype, input->size[0], input->size[1], 1, 1));
    }
    else errcheck(THcudnn_TensorDescriptor(&dinput, input));

    errcheck(cudnnCreateFilterDescriptor(&dweight));
    errcheck(cudnnSetFilter4dDescriptor(dweight, floattype, nOutputPlane, nInputPlane, kH, kW));
    errcheck(cudnnCreateTensorDescriptor(&dbias));
    errcheck(cudnnSetTensor4dDescriptor(dbias, CUDNN_TENSOR_NCHW, floattype, 1, bias->size[0], 1, 1));
    errcheck(cudnnCreateConvolutionDescriptor(&dconv));
    errcheck(cudnnSetConvolutionNdDescriptor(dconv, 2, pad, filterStride, upscale, CUDNN_CROSS_CORRELATION, floattype));
    errcheck(cudnnGetConvolutionNdForwardOutputDim(dconv, dinput, dweight, 4, sizes));
    THCudaTensor_resize4d(output, sizes[0], sizes[1], sizes[2], sizes[3]);
    errcheck(THcudnn_TensorDescriptor(&doutput, output));

    if(alg == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || alg == CUDNN_CONVOLUTION_FWD_ALGO_GEMM ||
        alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT || alg == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)
    {
        errcheck(cudnnGetConvolutionForwardWorkspaceSize(THcudnn_getHandle(), dinput, dweight, dconv, doutput, alg, &reqwssize));
        if(reqwssize > wssize)
        {
            if(ws)
                cudaFree(ws); // release the previous, smaller workspace before growing it
            wssize = reqwssize;
            errcheck(cudaMalloc(&ws, reqwssize));
        }
    }

    errcheck(cudnnConvolutionForward(THcudnn_getHandle(), &one, dinput, THFloatTensor_data(input),
        dweight, THFloatTensor_data(weight), dconv, alg, ws, wssize, &zero, doutput, THFloatTensor_data(output)));
    errcheck(cudnnAddTensor_v3(THcudnn_getHandle(), &one, dbias, THFloatTensor_data(bias), &one, doutput, THFloatTensor_data(output)));
    cudnnDestroyTensorDescriptor(dinput);
    cudnnDestroyFilterDescriptor(dweight);
    cudnnDestroyTensorDescriptor(dbias);
    cudnnDestroyTensorDescriptor(doutput);
    cudnnDestroyConvolutionDescriptor(dconv);
    return output;
}