THFloatTensor *nn_SpatialConvolutionMM_updateOutput(struct module *module, THFloatTensor *input)
{
	int kW = module->SpatialConvolution.kW;
	int kH = module->SpatialConvolution.kH;
	int dW = module->SpatialConvolution.dW;
	int dH = module->SpatialConvolution.dH;
	int padW = module->SpatialConvolution.padW;
	int padH = module->SpatialConvolution.padH;

	THFloatTensor *finput = module->SpatialConvolution.finput;
	THFloatTensor *weight = module->SpatialConvolution.weight;
	THFloatTensor *bias   = module->SpatialConvolution.bias;
	THFloatTensor *output = module->output;

	int batch = 1;
	if (input->nDimension == 3) {
		batch = 0;
		THFloatTensor_resize4d(input, 1, input->size[0], input->size[1], input->size[2]);
	}

	long batchSize = input->size[0];
	long nInputPlane  = module->SpatialConvolution.nInputPlane;
	long nOutputPlane = module->SpatialConvolution.nOutputPlane;
	long inputWidth   = input->size[3];
	long inputHeight  = input->size[2];
	long outputWidth  = (inputWidth + 2*padW - kW) / dW + 1;
	long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;


	if (outputWidth < 1 || outputHeight < 1)
		THError("Given input size: (%dx%dx%d). Calculated output size: (%dx%dx%d). Output size is too small",
		nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);

	THFloatTensor_resize3d(finput, batchSize, kW*kH*nInputPlane, outputHeight*outputWidth);
	THFloatTensor_resize4d(output, batchSize, nOutputPlane, outputHeight, outputWidth);

	long t;
#pragma omp parallel for if(batchSize >= 4) private(t)
	for (t = 0; t < batchSize; t++) {
		THFloatTensor *input_t = THFloatTensor_newSelect(input, 0, t);
		THFloatTensor *output_t = THFloatTensor_newSelect(output, 0, t);
		THFloatTensor *finput_t = THFloatTensor_newSelect(finput, 0, t);

		nn_SpatialConvolutionMM_updateOutput_frame(input_t, output_t, weight, bias, finput_t,
			kW, kH, dW, dH, padW, padH,
			nInputPlane, inputWidth, inputHeight,
			nOutputPlane, outputWidth, outputHeight);

		THFloatTensor_free(input_t);
		THFloatTensor_free(output_t);
		THFloatTensor_free(finput_t);
	}

	if (batch == 0) {
		THFloatTensor_resize3d(output, nOutputPlane, outputHeight, outputWidth);
		THFloatTensor_resize3d(input, nInputPlane, inputHeight, inputWidth);
	}

	return output;
}
Exemple #2
0
int THProcessFloat(THNETWORK *network, float *data, int batchsize, int width, int height, float **result, int *outwidth, int *outheight)
{
	int b, c, i;
	THFloatTensor *t = THFloatTensor_new();
	THFloatTensor *out;
	t->nDimension = 4;
	t->size[0] = batchsize;
	t->size[1] = 3;
	t->size[2] = height;
	t->size[3] = width;
	t->stride[0] = 3 * width * height;
	t->stride[1] = width * height;
	t->stride[2] = width;
	t->stride[3] = 1;
	t->storage = THFloatStorage_newwithbuffer((float *)data);
#pragma omp parallel for private(b, c, i)
	for(b = 0; b < batchsize; b++)
		for(c = 0; c < 3; c++)
			for(i = 0; i < width*height; i++)
				data[b * t->stride[0] + c * t->stride[1] + i] =
					(data[b * t->stride[0] + c * t->stride[1] + i] - network->mean[c]) / network->std[c];
#ifdef CUDNN
	if(network->net->cuda)
	{
		THFloatTensor *t2 = THCudaTensor_newFromFloatTensor(t);
		out = forward(network->net, t2);
		THFloatTensor_free(t2);
		if(network->out)
			THFloatTensor_free(network->out);
		network->out = THFloatTensor_newFromCudaTensor(out);
		out = network->out;
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
	{
		THFloatTensor *t2 = THOpenCLTensor_newFromImageTensor(t);
		out = forward(network->net, t2);
		THFloatTensor_free(t2);
		if(network->out)
			THFloatTensor_free(network->out);
		network->out = THFloatTensor_newFromOpenCLImageTensor(out);
		out = network->out;
	} else
#endif
	out = forward(network->net, t);
	THFloatTensor_free(t);
	*result = out->storage->data;
	if(out->nDimension >= 3)
	{
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else *outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}
Exemple #3
0
void THMakeSpatial(THNETWORK *network)
{
	int i, size = 231, nInputPlane = 3;
	
	for(i = 0; i < network->net->nelem; i++)
	{
		if(network->net->modules[i].type == MT_View || network->net->modules[i].type == MT_Reshape)
		{
			THFloatTensor_free(network->net->modules[i].output);
			memmove(network->net->modules+i, network->net->modules+i+1, sizeof(*network->net->modules) * (network->net->nelem - i - 1));
			network->net->nelem--;
			i--;
		} else if(network->net->modules[i].type == MT_Linear)
		{
			THFloatTensor_free(network->net->modules[i].Linear.addBuffer);
			network->net->modules[i].updateOutput = nn_SpatialConvolutionMM_updateOutput;
#ifndef USEBLAS
			network->net->modules[i].type = MT_SpatialConvolutionVirtMM;
#else
			network->net->modules[i].type = MT_SpatialConvolutionMM;
#endif
			struct SpatialConvolution *c = &network->net->modules[i].SpatialConvolution;
			c->finput = THFloatTensor_new();
			c->padW = c->padH = 0;
			c->dW = c->dH = 1;
			c->kW = c->kH = size;
			c->nInputPlane = nInputPlane;
			nInputPlane = c->nOutputPlane = c->weight->size[0];
			size = (size + 2*c->padW - c->kW) / c->dW + 1;
		} else if(network->net->modules[i].type == MT_SpatialConvolution ||
			network->net->modules[i].type == MT_SpatialConvolutionMM ||
			network->net->modules[i].type == MT_SpatialConvolutionVirtMM)
		{
			struct SpatialConvolution *c = &network->net->modules[i].SpatialConvolution;
			size = (size + 2*c->padW - c->kW) / c->dW + 1;
			nInputPlane = network->net->modules[i].SpatialConvolution.nOutputPlane;
		} else if(network->net->modules[i].type == MT_SpatialMaxPooling)
		{
			struct SpatialMaxPooling *c = &network->net->modules[i].SpatialMaxPooling;
			if(c->ceil_mode)
				size = (long)(ceil((float)(size - c->kH + 2*c->padH) / c->dH)) + 1;
			else size = (long)(floor((float)(size - c->kH + 2*c->padH) / c->dH)) + 1;
		} else if(network->net->modules[i].type == MT_SpatialZeroPadding)
		{
			struct SpatialZeroPadding *c = &network->net->modules[i].SpatialZeroPadding;
			size += c->pad_l + c->pad_r;
		}
	}
}
Exemple #4
0
int THProcessYUYV(THNETWORK *network, unsigned char *image, int width, int height, float **results, int *outwidth, int *outheight)
{
	THFloatTensor *out;
	THFloatStorage *st;

#ifdef CUDNN
	if(network->net->cuda)
		THError("This function is not supported with CUDNN");
#endif
#ifdef OPENCL
	if(network->net->cuda)
		THError("This function is not supported with OpenCL");
#endif
	st = THFloatStorage_new(width * height * 3);
	yuyv2fRGB(image, st->data, width*height, width, width, height, network->mean, network->std);
	THFloatTensor *t = THFloatTensor_new();
	t->storage = st;
	t->nDimension = 3;
	t->size[0] = 3;
	t->size[1] = height;
	t->size[2] = width;
	t->stride[0] = width * height;
	t->stride[1] = width;
	t->stride[2] = 1;
	out = forward(network->net, t);
	THFloatTensor_free(t);
	*results = out->storage->data;
	if(out->nDimension >= 3)
	{
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else *outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}
static void nn_SpatialConvolutionMM_updateOutput_frame(THFloatTensor *input, THFloatTensor *output,
	THFloatTensor *weight, THFloatTensor *bias,
	THFloatTensor *finput,
	int kW, int kH, int dW, int dH, int padW, int padH,
	long nInputPlane, long inputWidth, long inputHeight,
	long nOutputPlane, long outputWidth, long outputHeight)
{
	nn_unfolded_copy(finput, input, kW, kH, dW, dH, padW, padH,
		nInputPlane, inputWidth, inputHeight, outputWidth, outputHeight);

	THFloatTensor *output2d = THFloatTensor_newWithStorage2d(output->storage, output->storageOffset,
		nOutputPlane, -1,
		outputHeight*outputWidth, -1);

	long i;
	for (i = 0; i < nOutputPlane; i++)
	{
		float *data = output->storage->data + output->storageOffset + output->stride[0]*i;
		float what = bias->storage->data[i];
		long len = outputHeight*outputWidth;
		THFloatVector_fill(data, what, len);
	}

	THFloatTensor_addmm(output2d, 1, output2d, 1, weight, finput);

	THFloatTensor_free(output2d);
}
// Copy extracted patches to CUDA memory and run the network
// One has to keep mind that GPU memory is limited and extracting too many patches
// at once might cause troubles
// So if you need to extract a lot of patches, an efficient way would be to
// devide the set in smaller equal parts and preallocate CPU and GPU memory
void extractDescriptors(THCState *state,
    cunn::Sequential::Ptr net,
    const std::vector<cv::Mat>& patches,
    cv::Mat& descriptors)
{
  size_t batch_size = 128;
  size_t N = patches.size();

  THFloatTensor *buffer = THFloatTensor_newWithSize4d(batch_size, 1, M, M);
  THCudaTensor *input = THCudaTensor_newWithSize4d(state, batch_size, 1, M, M);

  for(int j=0; j < ceil((float)N/batch_size); ++j)
  {
    float *data = THFloatTensor_data(buffer);
    size_t k = 0;
    for(size_t i = j*batch_size; i < std::min((j+1)*batch_size, N); ++i, ++k)
      memcpy(data + k*M*M, patches[i].data, sizeof(float) * M * M);

    // initialize 4D CUDA tensor and copy patches into it
    THCudaTensor_copyFloat(state, input, buffer);

    // propagate through the network
    THCudaTensor *output = net->forward(input);

    // copy descriptors back
    THFloatTensor *desc = THFloatTensor_newWithSize2d(output->size[0], output->size[1]);
    THFloatTensor_copyCuda(state, desc, output);

    size_t feature_dim = output->size[1];
    if(descriptors.cols != feature_dim || descriptors.rows != N)
      descriptors.create(N, feature_dim, CV_32F);

    memcpy(descriptors.data + j * feature_dim * batch_size * sizeof(float),
        THFloatTensor_data(desc),
        sizeof(float) * feature_dim * k);

    THFloatTensor_free(desc);
  }

  THCudaTensor_free(state, input);
  THFloatTensor_free(buffer);
}
void THCudaTensor_copyFloat(THCudaTensor *self, struct THFloatTensor *src)
{
  THArgCheck(THCudaTensor_nElement(self) == THFloatTensor_nElement(src), 2, "sizes do not match"); 

  {
    THCudaTensor *selfc = THCudaTensor_newContiguous(self);
    src = THFloatTensor_newContiguous(src);
  
    THCudaCheck(cudaMemcpy(selfc->storage->data + selfc->storageOffset, src->storage->data + src->storageOffset, THFloatTensor_nElement(src) * sizeof(float), cudaMemcpyHostToDevice));

    THFloatTensor_free(src);
    THCudaTensor_freeCopyTo(selfc, self);
  }
}
Exemple #8
0
void THFreeNetwork(THNETWORK *network)
{
	freenetwork(network->net);
	if(network->netobj)
	{
		freeobject(network->netobj);
		free(network->netobj);
	}
	if(network->statobj)
	{
		freeobject(network->statobj);
		free(network->statobj);
	}
	if(network->out)
		THFloatTensor_free(network->out);
	free(network);
}
Exemple #9
0
THFloatTensor *forward(struct network *net, THFloatTensor *in)
{
	int i;
	double t = 0, convtot = 0, convflops = 0;
	
#ifdef OPENCL
	if(net->opencl == 1)
		OpenCL_Build(net, in);
#endif
	for(i = 0; i < net->nelem; i++)
	{
		if(th_profile)
			t = th_seconds();
		in = net->modules[i].updateOutput(&net->modules[i], in);
		// You can remove these lines if you don't have problems with memory
		// These lines free intermediate results
		if(i > 0)
		{
			THFloatTensor_free(net->modules[i-1].output);
			net->modules[i-1].output = THFloatTensor_new();
		}
		if(th_profile)
		{
#ifdef OPENCL
			if(net->opencl)
				clFinish(cl_queue);
#endif
			t = th_seconds() - t;
			if(net->modules[i].type == MT_SpatialConvolutionMM ||
				net->modules[i].type == MT_SpatialConvolutionVirtMM ||
				net->modules[i].type == MT_SpatialConvolution)
			{
				double flops = 2.0 * THFloatTensor_nElement(in) * net->modules[i].SpatialConvolution.nInputPlane *
					net->modules[i].SpatialConvolution.kW * net->modules[i].SpatialConvolution.kH;
				printf("%f seconds for module %d, %f Gflops/s\n", t, i+1, flops * 1e-9 / t);
				convtot += t;
				convflops += flops;
			} else printf("%f seconds for module %d\n", t, i+1);
		}
		if(th_debug > 1)
			printf("%d) %d %d %ld %ld %ld %ld\n", i+1, net->modules[i].type, in->nDimension, in->size[0], in->size[1], in->size[2], in->size[3]);
	}
	if(th_profile)
		printf("%f seconds for convolutions %f Gflops/s\n", convtot, convflops * 1e-9 / convtot);
	return in;
}
Exemple #10
0
THFloatTensor *nn_View_updateOutput(struct module *module, THFloatTensor *input)
{
	long nElements = THFloatTensor_nElement(input);
	long numElements = module->View.numElements;
	long batchSize = nElements / numElements;
	THFloatTensor *view;

	if (batchSize > 1)
		view = THFloatTensor_newWithStorage2d(input->storage, input->storageOffset, batchSize, numElements, numElements, 1);
	else
		view = THFloatTensor_newWithStorage1d(input->storage, input->storageOffset, numElements, 1);

	THFloatTensor_free(module->output);
	module->output = view;

	return module->output;
}
Exemple #11
0
THFloatTensor *forward(struct network *net, THFloatTensor *in)
{
	int i;
	
	for(i = 0; i < net->nelem; i++)
	{
		in = net->modules[i].updateOutput(&net->modules[i], in);
		// You can remove these lines if you don't have problems with memory
		// These lines free intermediate results
		if(i > 0)
		{
			THFloatTensor_free(net->modules[i-1].output);
			net->modules[i-1].output = 0;
		}
		//printf("%d) %d %d %ld %ld %ld %ld\n", i+1, net->modules[i].type, in->nDimension, in->size[0], in->size[1], in->size[2], in->size[3]);
	}
	return in;
}
Exemple #12
0
int THProcessImages(THNETWORK *network, unsigned char **images, int batchsize, int width, int height, int stride, float **results, int *outwidth, int *outheight, int bgr)
{
	int i;
	THFloatTensor *out, *t = 0;
	THFloatStorage *st;
	
#ifdef CUDNN
	if(network->net->cuda)
	{
#ifdef HAVEFP16
		if(floattype == CUDNN_DATA_HALF)
		{
			st = THCudaStorage_new(batchsize * (width * height * 3));
			for(i = 0; i < batchsize; i++)
				cuda_rgb2half((unsigned short *)st->data + i * (width * height * 3), images[i], width, height, stride, network->mean, network->std, bgr);
		} else
#endif
		{
			st = THCudaStorage_new(batchsize * width * height * 3);
			for(i = 0; i < batchsize; i++)
				cuda_rgb2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std, bgr);
		}
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
		t = OpenCL_LoadImage(images[0], width, height, stride, network->mean, network->std, bgr);
	else
#endif
	{
		st = THFloatStorage_new(batchsize * width * height * 3);
		if(bgr)
#pragma omp parallel for if(batchsize>1) private(i)
			for(i = 0; i < batchsize; i++)
				bgr2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std);
		else
#pragma omp parallel for if(batchsize>1) private(i)
			for(i = 0; i < batchsize; i++)
				rgb2float(st->data + i * width * height * 3, images[i], width, height, stride, network->mean, network->std);
	}
	if(!t)
	{
		t = THFloatTensor_new();
		t->storage = st;
		if(batchsize == 1)
		{
			t->nDimension = 3;
			t->size[0] = 3;
			t->size[1] = height;
			t->size[2] = width;
			t->stride[0] = width * height;
			t->stride[1] = width;
			t->stride[2] = 1;
		} else {
			t->nDimension = 4;
			t->size[0] = batchsize;
			t->size[1] = 3;
			t->size[2] = height;
			t->size[3] = width;
			t->stride[0] = 3 * width * height;
			t->stride[1] = width * height;
			t->stride[2] = width;
			t->stride[3] = 1;
		}
	}
#ifdef CUDNN
	if(network->net->cuda)
	{
		out = forward(network->net, t);
		if(network->out)
			THFloatTensor_free(network->out);
#ifdef HAVEFP16
		if(floattype == CUDNN_DATA_HALF)
			network->out = THFloatTensor_newFromHalfCudaTensor(out);
		else
#endif
			network->out = THFloatTensor_newFromCudaTensor(out);
		out = network->out;
	} else
#endif
#ifdef OPENCL
	if(network->net->opencl)
	{
		out = forward(network->net, t);
		if(network->out)
			THFloatTensor_free(network->out);
#ifdef HAVEFP16
		if(cl_datasize == 2)
			network->out = THFloatTensor_newFromHalfOpenCLImageTensor(out);
		else
#endif
			network->out = THFloatTensor_newFromOpenCLImageTensor(out);
		out = network->out;
	} else
#endif
		out = forward(network->net, t);
	THFloatTensor_free(t);
	*results = out->storage->data;
	if(out->nDimension >= 3)
	{
		*outwidth = out->size[out->nDimension - 1];
		*outheight = out->size[out->nDimension - 2];
	} else *outwidth = *outheight = 1;
	return THFloatTensor_nElement(out);
}
Exemple #13
0
int main(int argc, char **argv)
{
	THNETWORK *net;
	float *result;
	int i, n = 0, rc, outwidth, outheight, runs = 1, print = 0, alg = 1, nbatch = 1;
	const char *modelsdir = 0, *inputfile = 0;

	for(i = 1; i < argc; i++)
	{
		if(argv[i][0] != '-')
			continue;
		switch(argv[i][1])
		{
		case 'm':
			if(i+1 < argc)
				modelsdir = argv[++i];
			break;
		case 'i':
			if(i+1 < argc)
				inputfile = argv[++i];
			break;
		case 'a':
			if(i+1 < argc)
				alg = atoi(argv[++i]);
			break;
		case 'p':
			print = 1;
			break;
		case 'r':
			if(i+1 < argc)
				runs = atoi(argv[++i]);
			break;
		case 'b':
			if(i+1 < argc)
			{
				nbatch = atoi(argv[++i]);
				if(nbatch > 256 || nbatch < 1)
					nbatch = 256;
			}
			break;
		}
	}
	if(!modelsdir || !inputfile)
	{
		fprintf(stderr, "Syntax: test -m <models directory> -i <input file>\n");
		fprintf(stderr, "             [-r <number of runs] [-p(rint results)]\n");
		fprintf(stderr, "             [-a <alg=0:norm,1:MM (default),2:virtMM,3:cuDNN,4:cudNNhalf>]\n");
		fprintf(stderr, "             [-b <nbatch>]\n");
		return -1;
	}
	if(alg == 4)
	{
		alg = 3;
		THCudaHalfFloat(1);
	}
	THInit();
	net = THLoadNetwork(modelsdir);
	if(net)
	{
		THMakeSpatial(net);
		if(alg == 0)
			THUseSpatialConvolutionMM(net, 0);
		else if(alg == 1 || alg == 2)
			THUseSpatialConvolutionMM(net, alg);
		else if(alg == 3)
		{
			THNETWORK *net2 = THCreateCudaNetwork(net);
			if(!net2)
				THError("CUDA not compiled in");
			THFreeNetwork(net);
			net = net2;
		}
		if(strstr(inputfile, ".t7"))
		{
			struct thobject input_o;

			rc = loadtorch(inputfile, &input_o, 8);
			if(!rc)
			{
				THFloatTensor *in = THFloatTensor_newFromObject(&input_o);
				// In CuDNN the first one has to do some initializations, so don't count it for timing
				if(alg == 3)
					THProcessFloat(net, in->storage->data, 1, in->size[2], in->size[1], &result, &outwidth, &outheight);
				t = seconds();
				for(i = 0; i < runs; i++)
					n = THProcessFloat(net, in->storage->data, 1, in->size[2], in->size[1], &result, &outwidth, &outheight);
				t = (seconds() - t) / runs;
				THFloatTensor_free(in);
				freeobject(&input_o);
			} else printf("Error loading %s\n", inputfile);
		} else {
			img_t image;

			rc = loadimage(inputfile, &image);
			if(!rc)
			{
				unsigned char *bitmaps[256];
				for(i = 0; i < nbatch; i++)
					bitmaps[i] = image.bitmap;
				// In CuDNN the first one has to do some initializations, so don't count it for timing
				if(alg == 3)
					THProcessImages(net, &image.bitmap, 1, image.width, image.height, 3*image.width, &result, &outwidth, &outheight, 0);
				t = seconds();
				for(i = 0; i < runs; i++)
					n = THProcessImages(net, bitmaps, nbatch, image.width, image.height, 3*image.width, &result, &outwidth, &outheight, 0);
				t = (seconds() - t) / runs;
#ifdef USECUDAHOSTALLOC
				cudaFreeHost(image.bitmap);
#else
				free(image.bitmap);
#endif
			} else printf("Error loading image %s\n", inputfile);
		}
		if(print)
			for(i = 0; i < n; i++)
				printf("(%d,%d,%d): %f\n", i/(outwidth*outheight), i % (outwidth*outheight) / outwidth, i % outwidth, result[i]);
		printf("1 run processing time: %lf\n", t);
        THFreeNetwork(net);
	} else printf("The network could not be loaded: %d\n", THLastError());
#ifdef MEMORYDEBUG
	debug_memorydump(stderr);
#endif
	return 0;
}
Exemple #14
0
void THFloatTensor_addmm(THFloatTensor *r_, float beta, THFloatTensor *t, float alpha, THFloatTensor *m1, THFloatTensor *m2)
{
	char transpose_r, transpose_m1, transpose_m2;
	THFloatTensor *r__, *m1_, *m2_;

	if( (m1->nDimension != 2) || (m2->nDimension != 2))
		THError("matrices expected, got %dD, %dD tensors", m1->nDimension, m2->nDimension);

	if(m1->size[1] != m2->size[0])
		THError("size mismatch, m1: %ld, m2: %ld", m1->size[1], m2->size[0]);

	if( t->nDimension != 2 )
		THError("matrix expected, got %dD tensor for t", t->nDimension);

	if( (t->size[0] != m1->size[0]) || (t->size[1] != m2->size[1]) )
		THError("size mismatch, t: %ld, m1: %ld, t: %ld, m2: %ld", t->size[0], m1->size[1], t->size[1], m2->size[1]);

	if(t != r_)
		THError("Not implemented: t != r");

	/*  printf("%ldx%ld = %ldx%ld X %ldx%ld\n", r_->size[0], r_->size[1], m1->size[0], m1->size[1], m2->size[0], m2->size[1]); */

	/* r_ */
	if(r_->stride[0] == 1 && r_->stride[1] != 0)
	{
		transpose_r = 'n';
		r__ = r_;
	}
	else if(r_->stride[1] == 1 && r_->stride[0] != 0)
	{
		THFloatTensor *swap = m2;
		m2 = m1;
		m1 = swap;
		transpose_r = 't';
		r__ = r_;
	}
	else
	{
		THError("Transpose not implemented (1)");
		return;
/*		transpose_r = 'n';

		r__ = THFloatTensor_newWithSize2d(r_->size[1], r_->size[0]);
		THFloatTensor_copy(r__, r_);
		THFloatTensor_transpose(r__, NULL, 0, 1);*/
	}

	/* m1 */
	if(m1->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && m1->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
	{
		transpose_m1 = 'n';
		m1_ = m1;
	}
	else if(m1->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && m1->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
	{
		transpose_m1 = 't';
		m1_ = m1;
	}
	else
	{
		THError("Transpose not implemented (2)");
		return;
		/*transpose_m1 = (transpose_r == 'n' ? 't' : 'n');
		m1_ = THFloatTensor_newContiguous(m1);*/
	}

	/* m2 */
	if(m2->stride[(transpose_r == 'n' ? 0 : 1)] == 1 && m2->stride[(transpose_r == 'n' ? 1 : 0)] != 0)
	{
		transpose_m2 = 'n';
		m2_ = m2;
	}
	else if(m2->stride[(transpose_r == 'n' ? 1 : 0)] == 1 && m2->stride[(transpose_r == 'n' ? 0 : 1)] != 0)
	{
		transpose_m2 = 't';
		m2_ = m2;
	}
	else
	{
		THError("Transpose not implemented (3)");
		return;
		/*transpose_m2 = (transpose_r == 'n' ? 't' : 'n');
		m2_ = THFloatTensor_(newContiguous)(m2);*/
	}

	/* do the operation */
	THBlas_gemm(transpose_m1,
		transpose_m2,
		r__->size[(transpose_r == 'n' ? 0 : 1)],
		r__->size[(transpose_r == 'n' ? 1 : 0)],
		m1_->size[(transpose_r == 'n' ? 1 : 0)],
		alpha,
		THFloatTensor_data(m1_),
		(transpose_m1 == 'n' ? m1_->stride[(transpose_r == 'n' ? 1 : 0)] : m1_->stride[(transpose_r == 'n' ? 0 : 1)]),
		THFloatTensor_data(m2_),
		(transpose_m2 == 'n' ? m2_->stride[(transpose_r == 'n' ? 1 : 0)] : m2_->stride[(transpose_r == 'n' ? 0 : 1)]),
		beta,
		THFloatTensor_data(r__),
		r__->stride[(transpose_r == 'n' ? 1 : 0)]);

	/* free intermediate variables */
	if(m1_ != m1)
		THFloatTensor_free(m1_);

	if(m2_ != m2)
		THFloatTensor_free(m2_);

	if(r__ != r_)
		THError("freeCopyTo not implemented");
		/*THFloatTensor_(freeCopyTo)(r__, r_);*/
}