scottgs::FloatMatrix scottgs::MatrixMultiply::multiply(const scottgs::FloatMatrix& lhs, const scottgs::FloatMatrix& rhs) const
{
	// Verify acceptable dimensions
	if (lhs.size2() != rhs.size1())
		throw std::logic_error("matrix incompatible lhs.size2() != rhs.size1()");

	return boost::numeric::ublas::prod(lhs,rhs);
}
scottgs::FloatMatrix scottgs::MatrixMultiply::operator()(const scottgs::FloatMatrix& lhs, const scottgs::FloatMatrix& rhs) const
{
	// Verify acceptable dimensions
	if (lhs.size2() != rhs.size1())
		throw std::logic_error("matrix incompatible lhs.size2() != rhs.size1()");

	scottgs::FloatMatrix result(lhs.size1(),rhs.size2());


	// YOUR ALGORIHM WITH COMMENTS GOES HERE:
	
	int i, j, k;
	scottgs::FloatMatrix tMatrix = scottgs::MatrixMultiply::transpose(rhs);
	std::vector<float> vlhs = makeVector(lhs);
	std::vector<float> vrhs = makeVector(tMatrix);	
	
	for (i = 0; i < lhs.size1(); ++i)
	{
		for (j = 0; j < rhs.size2(); ++j)
		{
			for (k = 0; k < lhs.size2(); ++k)
			{
				//result(i, j) += lhs(i,k) * rhs(k,j);
				//result(i, j) += vlhs[k + i * lhs.size2()] * vrhs[j + k * rhs.size2() - 1];
				result(i, j) += vlhs[k + i * lhs.size2()] * vrhs[k + j * rhs.size1()];
			}
		}
	}



	return result;
}
scottgs::FloatMatrix scottgs::MatrixMultiply::transpose(const scottgs::FloatMatrix& matrix) const
{
	scottgs::FloatMatrix tMatrix(matrix.size2(), matrix.size1());
	int i, j;
	
	for (i = 0; i < matrix.size1(); ++i)
	{
		for (j = 0; j < matrix.size2(); ++j)
		{
			tMatrix(j, i) = matrix(i, j);
		}
	}
	
	return tMatrix;
}
std::vector<float> scottgs::MatrixMultiply::makeVectorTransposed(const scottgs::FloatMatrix& matrix) const
{
	std::vector<float> vmatrix;
	int i;
	int j;
	
	for (i = 0; i < matrix.size1(); ++i)
	{
		for (j = 0; j < matrix.size2(); ++j)
		{
			vmatrix.push_back(matrix(j, i));
		}
	}
	
	return vmatrix;
}
scottgs::FloatMatrix scottgs::MatrixMultiply::operator()(const scottgs::FloatMatrix& lhs, const scottgs::FloatMatrix& rhs) const
{
	// Verify acceptable dimensions
	if (lhs.size2() != rhs.size1())
		throw std::logic_error("matrix incompatible lhs.size2() != rhs.size1()");

	//create the matrix
	scottgs::FloatMatrix result(lhs.size1(),rhs.size2());
	
	//create the unsigned ints used for the three for loops
	unsigned int i,j,k,ii,jj,kk; //matrices indexes
	
	//get the sizes I need and put them into a constant
	const unsigned int m1_num_row = lhs.size1(); //# of row of matrix 1
	const unsigned int m1_num_col = lhs.size2(); //# of col of matrix 1
	const unsigned int m2_num_row = rhs.size1(); //# of row of matrix 2
	const unsigned int m2_num_col = rhs.size2(); //# of column of matrix 2
	
	//block size calculation
	//2 * (blockSize)^2 * 4 = 32768 (L1 cache)
	//SQRT(32768/8) = blockSize = 64
	
	const int block_size = 64;
	
	//get a reference of the matrix's first element ( this will be a pointer to the first element )
	const float *m1 = &lhs(0,0);
	const float *m2 = &rhs(0,0);
	
	//get a copy of the first element.
	float *r = &result(0,0);

	for (i = 0; i < m1_num_row; ++i)
		//loop through each column of matrix 2
		for (j = 0; j < m2_num_col; ++j)
			//loop through each column of matrix 1
			for (k = 0; k < m2_num_row; ++k)
				r[i*m2_num_col + j] = r[i*m2_num_col + j] + m1[i*m1_num_col + k] *m2[k*m2_num_col + j];
	
	return result;
}
scottgs::FloatMatrix scottgs::MatrixMultiply::operator()(const scottgs::FloatMatrix& lhs, const scottgs::FloatMatrix& rhs) const
{
	// Verify acceptable dimensions
	if (lhs.size2() != rhs.size1())
		throw std::logic_error("matrix incompatible lhs.size2() != rhs.size1()");

	scottgs::FloatMatrix result(lhs.size1(),rhs.size2());
	
	//create the unsigned ints used for the three for loops
	unsigned int i,j,k,ii,jj,kk; //matrices indexes
	
	//get the sizes I need and put them into a constant
	const unsigned int m1_num_row = lhs.size1(); //# of row of matrix 1
	const unsigned int m1_num_col = lhs.size2(); //# of col of matrix 1
	const unsigned int m2_num_row = rhs.size1(); //# of row of matrix 2
	const unsigned int m2_num_col = rhs.size2(); //# of column of matrix 2
	
	//block size calculation
	//2 * (blockSize)^2 * 4 = 32768 (L1 cache)
	//SQRT(32768/8) = blockSize = 64
	
	const int block_size = 64;
	
	//get a reference of the matrix's first element ( this will be a pointer to the first element )
	const float *m1 = &lhs(0,0);
	
	//malloc memory for the soon to be transposed matrix ( matrix #2)
	//float *transposed = (float*)malloc(sizeof(float)*m2_num_col*m2_num_row);
	std::vector<float> transposed;
	transposed.resize(m2_num_col*m2_num_row);
	
	//get a copy of the first element.
	float *r = &result(0,0);
	
	//transpose the second matrix
	for (j = 0; j < m2_num_col; ++j)
		for (i = 0; i < m2_num_row; ++i)
			transposed[j*m2_num_row + i] = m1[i*m2_num_col + j];
	
	if(m1_num_row > 100 || m1_num_col > 100 || m2_num_row > 100 || m2_num_col > 100){
		for (i = 0; i < m1_num_row; ++i)
			//loop through each column of matrix 2
			for (j = 0; j < m2_num_col; ++j)
				//loop through each column of matrix 1
				for (k = 0; k < m1_num_col; ++k)
					r[i*m2_num_col + j] = r[i*m2_num_col + j] + m1[i*m1_num_col + k] * transposed[j*m1_num_col + k];
	}else{
		//loop through each row of matrix 1
		for (ii = 0; ii < m1_num_row; ii+=block_size)
			//loop through each column of matrix 2
			for (jj = 0; jj < m2_num_col; jj+=block_size)
				//loop through each column of matrix 1
				for (kk = 0; kk < m1_num_col; kk+=block_size)
					for (i = ii; i < std::min(m1_num_row, ii+block_size); ++i)
						//loop through each column of matrix 2
						for (j = jj; j < std::min(m2_num_col, jj+block_size); ++j)
							//loop through each column of matrix 1
							for (k = kk; k < std::min(m1_num_col, kk+block_size); ++k)
								r[i*m2_num_col + j] = r[i*m2_num_col + j] + m1[i*m1_num_col + k] *transposed[j*m1_num_col + k];
	}
	
	return result;
}