Example #1
0
/*
Saves matrix to file

@param filename filename for the txt output
@param mat matrix to be outputted
*/
void	saveMatrix(const std::string filename, const Matrix& mat){
	std::ofstream	fOut(filename);
	if (!fOut) {
		std::cout << "Error opening file: " << filename << std::endl;
		exit(EXIT_FAILURE);
	}

	if(!(fOut << mat.getDimM() << " " << mat.getDimN() << std::endl))
	{
		std::cout << "Error in writing matrix entries!" << std::endl;
		exit(EXIT_FAILURE);
	}

	for (int m = 0; m < mat.getDimM(); m++) {
		for (int n = 0; n < mat.getDimN(); n++) {
			if(!(fOut << mat(m, n) << std::endl))
			{
				std::cout << "Error in writing matrix entries!" << std::endl;
				exit(EXIT_FAILURE);
			}
		}
	}

	fOut.close();
}
Example #2
0
/*
debug output of matrix
*/
void	printMatrixT(const Matrix& mat){
	for (int m = 0; m < mat.getDimM(); m++) {
		for (int n = 0; n < mat.getDimN(); n++) {
			std::cout << mat.T(m, n) << " ";
		}
		std::cout << std::endl;
	}
}
Example #3
0
/*
convenience function for Matrix Matrix Multiplication
@param A input matrix
@param B input matrix (COLUMN MAJOR)
@param C output matrix
*/
inline
void	MMM(Matrix& A, Matrix& B, Matrix& C){
	/*int	LD = A.dimRows;
	if (LD<A.dimCols) LD = A.dimCols;
	if (LD<B.dimCols) LD = B.dimCols;

	LD--;
	LD |= LD >> 1;
	LD |= LD >> 2;
	LD |= LD >> 4;
	LD |= LD >> 8;
	LD |= LD >> 16;
	LD++;*/

	double* temp = (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD);

	Matrix	BT(temp, temp, B.getDimN(), B.getDimM(), 0, 0);
	
	Matrix	P( (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD), nullptr, A.getDimM(), A.getDimM(), 0, 0);
	Matrix	PS( (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD), nullptr, A.getDimM(), A.getDimM(), 0, 0);
	Matrix	S( (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD), nullptr, A.getDimM(), A.getDimM(), 0, 0);
	Matrix	T(nullptr, (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD), A.getDimM(), A.getDimM(), 0, 0);

	A.dimRows = LD - PADDING;

	transpose(B, BT);

	if (A.dimRows<TRUNCATION_POINT){
		naive(A, BT, C);
	} else {
		strassen(A, BT, C, P, PS, S, T);
	}

	free(BT.data);
	free(P.data);
	free(PS.data);
	free(S.data);
	free(T.dataT);
}
Example #4
0
/*
Transpose Matrix M and write result to Matrix MT

M is not changed
@param M input matrix
@param MT output matrix
*/
inline
void	transpose(const Matrix& M, Matrix& MT){
	int dimM = M.getDimM();
	int dimN = M.getDimN();
	//transpose b

	for (int m = 0; m < dimM; m+=BLOCK_SIZE){				///rows of b
		for (int n = 0; n < dimN; n+=4){			///cols of b	
			for (int i = m; i<m+BLOCK_SIZE; ++i){
				__m256d*	pM = M.get(i, n);
				MT(n, i) = (*pM)[0];
				MT(n+1, i) = (*pM)[1];
				MT(n+2, i) = (*pM)[2];
				MT(n+3, i) = (*pM)[3];
				pM++;
				
			}
		}
	}
}
Example #5
0
/*
Naive implementation of Matrix Matrix Multiplication

@param A input matrix
@param B input matrix
@param C output matrix
*/
inline
void	naive(const Matrix& A, const Matrix& B, Matrix& C){
	//preload dimensions for faster access
	int dimM = C.getDimM();
	int dimN = C.getDimN();
	int dimL = A.getDimN();
	
	for (int m = 0; m < dimM; m+=4){				///rows of c
		for (int n = 0; n < dimN; n+=4){			///cols of c	
			//do calculation of a 4x4 block
			//std::cout << m << "\t" << n << std::endl;
			__m256d*	pA = A.get(m, 0);
			__m256d*	pB = A.get(m+1, 0);
			__m256d*	pC = A.get(m+2, 0);
			__m256d*	pD = A.get(m+3, 0);
			__m256d*	pK = B.getT(0, n);
			__m256d*	pL = B.getT(0, n+1);
			__m256d*	pM = B.getT(0, n+2);
			__m256d*	pN = B.getT(0, n+3);
			//std::cout << pA << "\t" << pB << "\t" << pC << "\t" << pD << std::endl;
			__m256d		K = _mm256_setzero_pd();
			__m256d		L = _mm256_setzero_pd();
			__m256d		M = _mm256_setzero_pd();
			__m256d		N = _mm256_setzero_pd();
			__m256d		O = _mm256_setzero_pd();
			__m256d		P = _mm256_setzero_pd();
			__m256d		Q = _mm256_setzero_pd();
			__m256d		R = _mm256_setzero_pd();
			__m256d		S = _mm256_setzero_pd();
			__m256d		T = _mm256_setzero_pd();
			__m256d		U = _mm256_setzero_pd();
			__m256d		V = _mm256_setzero_pd();
			__m256d		W = _mm256_setzero_pd();
			__m256d		X = _mm256_setzero_pd();
			__m256d		Y = _mm256_setzero_pd();
			__m256d		Z = _mm256_setzero_pd();
			for (int l = 0; l < dimL; l+=4){
				//std::cout <<"mul" << std::endl;
				K = K + (*pA) * (*pK);
				L = L + (*pA) * (*pL);
				M = M + (*pA) * (*pM);
				N = N + (*pA) * (*pN);
				O = O + (*pB) * (*pK);
				P = P + (*pB) * (*pL);
				Q = Q + (*pB) * (*pM);
				R = R + (*pB) * (*pN);
				S = S + (*pC) * (*pK);
				T = T + (*pC) * (*pL);
				U = U + (*pC) * (*pM);
				V = V + (*pC) * (*pN);
				W = W + (*pD) * (*pK);
				X = X + (*pD) * (*pL);
				Y = Y + (*pD) * (*pM);
				Z = Z + (*pD) * (*pN);
				//std::cout << "inc" <<std::endl;
				pA++;
				pB++;
				pC++;
				pD++;
				pK++;
				pL++;
				pM++;
				pN++;
			}
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			__m256d sumab = _mm256_hadd_pd(K, L);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			__m256d sumcd = _mm256_hadd_pd(M, N);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			__m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			__m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			__m256d sum =  _mm256_add_pd(perm, blend);

			C.set(m, n, sum);
			//C(m  , n)     = K[0] + K[1] + K[2] + K[3];
			//C(m  , n+1)   = L[0] + L[1] + L[2] + L[3];
			//C(m  , n+2)   = M[0] + M[1] + M[2] + M[3];
			//C(m  , n+3)   = N[0] + N[1] + N[2] + N[3];

			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(O, P);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(Q, R);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+1, n, sum);
			//C(m+1, n  )   = O[0] + O[1] + O[2] + O[3];
			//C(m+1, n+1)   = P[0] + P[1] + P[2] + P[3];
			//C(m+1, n+2)   = Q[0] + Q[1] + Q[2] + Q[3];
			//C(m+1, n+3)   = R[0] + R[1] + R[2] + R[3];
			
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(S, T);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(U, V);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+2, n, sum);
			//C(m+2, n  )   = S[0] + S[1] + S[2] + S[3];
			//C(m+2, n+1)   = T[0] + T[1] + T[2] + T[3];
			//C(m+2, n+2)   = U[0] + U[1] + U[2] + U[3];
			//C(m+2, n+3)   = V[0] + V[1] + V[2] + V[3];
			
			// {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]}
			sumab = _mm256_hadd_pd(W, X);
			// {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]}
			sumcd = _mm256_hadd_pd(Y, Z);

			// {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]}
			blend = _mm256_blend_pd(sumab, sumcd, 0b1100);
			// {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]}
			perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21);

			sum =  _mm256_add_pd(perm, blend);
			
			C.set(m+3, n, sum);
			
			//C(m+3, n  )   = W[0] + W[1] + W[2] + W[3];
			//C(m+3, n+1)   = X[0] + X[1] + X[2] + X[3];
			//C(m+3, n+2)   = Y[0] + Y[1] + Y[2] + Y[3];
			//C(m+3, n+3)   = Z[0] + Z[1] + Z[2] + Z[3];
		}
	}
}
Example #6
0
int main(int argc, char **argv) {
	///******************************************************
	///********************** INPUT *************************
	///******************************************************
	if (argc != 4) {
		std::cout << "Invalid number of arguments!" << std::endl;
		std::cout << "./compare A.out B.out" << std::endl;
		exit(EXIT_FAILURE);
	}

	//matrix dimensions
	int dimM = 0;
	int dimN = 0;
	int dimO = 0;

	//get dimensions
	std::ifstream	fIn(argv[1]);
	if (!fIn) {
		std::cout << "Error opening file: " << argv[1] << std::endl;
		exit(EXIT_FAILURE);
	}

	if(!(fIn >> dimM >> dimN))
	{
		std::cout << "Error in reading matrix entries!" << std::endl;
		exit(EXIT_FAILURE);
	}
	fIn.close();

	fIn.open(argv[2]);
	if (!fIn) {
		std::cout << "Error opening file: " << argv[2] << std::endl;
		exit(EXIT_FAILURE);
	}

	if(!(fIn >> dimN >> dimO))
	{
		std::cout << "Error in reading matrix entries!" << std::endl;
		exit(EXIT_FAILURE);
	}

	fIn.close();

	//calculate minimal matrix size
	//all matrices are padded with 0s to this size
	//should be power of 2 for efficient block division
	//dirty hack...
	LD = 64;
	if (LD<dimM) LD = dimM;
	if (LD<dimN) LD = dimN;
	if (LD<dimO) LD = dimO;

	LD--;
	LD |= LD >> 1;
	LD |= LD >> 2;
	LD |= LD >> 4;
	LD |= LD >> 8;
	LD |= LD >> 16;
	LD++;

	//add useless padding
	LD += PADDING;

	double* a = (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD);
	double* b = (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD);
	double* c = (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD);

	Matrix	A = loadMatrix(argv[1], &a[0]);
	Matrix	B = loadMatrix(argv[2], &b[0]);
	Matrix	C(&c[0], nullptr, A.getDimM(), B.getDimN(), 0, 0);

	///******************************************************
	///********************** CALCULATION *******************
	///******************************************************
	double time = 0;
	
#ifdef USE_LIKWID
	likwid_markerInit();
	likwid_markerStartRegion("dummy");
#endif

	siwir::Timer	timer;

	MMM(A, B, C);

	time = timer.elapsed();
	std::cout << dimM << "\t" << dimN << "\t" << dimO << "\t" << time << std::endl;

#ifdef USE_LIKWID
	likwid_markerStopRegion("dummy");
	likwid_markerClose();
#endif

	///******************************************************
	///********************** OUTPUT ************************
	///******************************************************

	saveMatrix(argv[3], C);

	free(a);
	free(b);
	free(c);
};
Example #7
0
/*
Strassen implementation of Matrix Matrix Multiplication

@param A input row major matrix
@param B input COLUMN MAJOR matrix
@param C output row major matrix
@param P temporary row major matrix
@param Ps temporary row major matrix
@param S temporary row major matrix
@param T temporary COLUMN MAJOR matrix
*/
void	strassen(Matrix& A, Matrix& B, Matrix& C, Matrix& P, Matrix& Ps, Matrix& S, Matrix& T){
	//get matrix dimensions and calculate size of blocks
	int	dim = A.getDimM(); //equal for all matrix dimensions
	int	dim2 = dim * 0.5;
	//std::cout << dim << "\t" << dim2 << std::endl;	
	
	//get blocks
	Matrix	A1 = A.getSubMatrix(0, 0, dim2, dim2);
	Matrix	A2 = A.getSubMatrix(0, dim2, dim2, dim2);
	Matrix	A3 = A.getSubMatrix(dim2, 0, dim2, dim2);
	Matrix	A4 = A.getSubMatrix(dim2, dim2, dim2, dim2);
	
	Matrix	B1 = B.getSubMatrix(0, 0, dim2, dim2);
	Matrix	B2 = B.getSubMatrix(0, dim2, dim2, dim2);
	Matrix	B3 = B.getSubMatrix(dim2, 0, dim2, dim2);
	Matrix	B4 = B.getSubMatrix(dim2, dim2, dim2, dim2);
	
	Matrix	C1 = C.getSubMatrix(0, 0, dim2, dim2);
	Matrix	C2 = C.getSubMatrix(0, dim2, dim2, dim2);
	Matrix	C3 = C.getSubMatrix(dim2, 0, dim2, dim2);
	Matrix	C4 = C.getSubMatrix(dim2, dim2, dim2, dim2);
	
	Matrix	S1 = S.getSubMatrix(0, 0, dim2, dim2);
	Matrix	S2 = S.getSubMatrix(0, dim2, dim2, dim2);
	Matrix	S3 = S.getSubMatrix(dim2, 0, dim2, dim2);
	Matrix	S4 = S.getSubMatrix(dim2, dim2, dim2, dim2);
	
	
	Matrix	T1 = T.getSubMatrix(0, 0, dim2, dim2);
	Matrix	T2 = T.getSubMatrix(0, dim2, dim2, dim2);
	Matrix	T3 = T.getSubMatrix(dim2, 0, dim2, dim2);
	Matrix	T4 = T.getSubMatrix(dim2, dim2, dim2, dim2);
		
	Matrix	P1 = P.getSubMatrix(0, 0, dim2, dim2);
	Matrix	P2 = P.getSubMatrix(0, dim2, dim2, dim2);
	Matrix	P3 = P.getSubMatrix(dim2, 0, dim2, dim2);
	Matrix	P4 = P.getSubMatrix(dim2, dim2, dim2, dim2);

	Matrix	P5 = Ps.getSubMatrix(0, 0, dim2, dim2);
	Matrix	P6 = Ps.getSubMatrix(0, dim2, dim2, dim2);
	Matrix	P7 = Ps.getSubMatrix(dim2, 0, dim2, dim2);
	//Matrix	P8 = Ps.getSubMatrix(dim2, dim2, dim2, dim2);
	//std::cout << "submatrices" << std::endl;
	
	//compute temporary S and T matrices
	for (int i=0; i<dim2; ++i){
		for (int j=0; j<dim2; j+=4){
			//std::cout << "S" << std::endl;
			__m256d*	pA1 = A1.get(i, j);
			__m256d*	pA2 = A2.get(i, j);
			__m256d*	pA3 = A3.get(i, j);
			__m256d*	pA4 = A4.get(i, j);
			__m256d*	pS2 = S2.get(i, j);
			S1.set(i, j, (*pA3)+(*pA4));
			S2.set(i, j, (*pA3)+(*pA4)-(*pA1));
			S3.set(i, j, (*pA1)-(*pA3));
			S4.set(i, j, (*pA2)-(*pS2));
			pA1++;
			pA2++;
			pA3++;
			pA4++;
			pS2++;
			//S1(i, j) = A3(i, j) + A4(i, j);
			//S2(i, j) = S1(i, j) - A1(i, j);
			//S3(i, j) = A1(i ,j) - A3(i, j);
			//S4(i, j) = A2(i, j) - S2(i, j);
			//std::cout << "T" << std::endl;
			__m256d*	pB1 = B1.getT(j, i);
			__m256d*	pB2 = B2.getT(j, i);
			__m256d*	pB3 = B3.getT(j, i);
			__m256d*	pB4 = B4.getT(j, i);
			__m256d*	pT2 = T2.getT(j, i);
			T1.setT(j, i, (*pB2)-(*pB1));
			T2.setT(j, i, (*pB4)-((*pB2)-(*pB1)));
			T3.setT(j, i, (*pB4)-(*pB2));
			T4.setT(j, i, (*pB3)-(*pT2));
			pB1++;
			pB2++;
			pB3++;
			pB4++;
			pT2++;
			//T1.T(j, i) = B2.T(j, i) - B1.T(j, i);
			//T2.T(j, i) = B4.T(j, i) - T1.T(j, i);
			//T3.T(j, i) = B4.T(j ,i) - B2.T(j, i);
			//T4.T(j, i) = B3.T(j, i) - T2.T(j, i);

		}
	
	}

	//calculate products
	if (dim2<TRUNCATION_POINT) {
		naive(A1, B1, P1);
		naive(A2, B3, P2);
		naive(S1, T1, P3);
		naive(S2, T2, P4);
		naive(S3, T3, P5);
		naive(S4, B4, P6);
		naive(A4, T4, P7);
	} else {
		strassen(A1, B1, P1, C1, C2, C3, B2);
		strassen(A2, B3, P2, C1, C2, C3, B2);
		strassen(S1, T1, P3, C1, C2, C3, B2);
		strassen(S2, T2, P4, C1, C2, C3, B2);
		strassen(S3, T3, P5, C1, C2, C3, B2);
		strassen(S4, B4, P6, C1, C2, C3, B2);
		strassen(A4, T4, P7, C1, C2, C3, B2);
	}
	
	//assemble final matrix
	for (int i=0; i<dim2; ++i){
		for (int j=0; j<dim2; j+=4){
			__m256d*	pP1 = P1.get(i, j);
			__m256d*	pP2 = P2.get(i, j);
			__m256d*	pP3 = P3.get(i, j);
			__m256d*	pP4 = P4.get(i, j);
			__m256d*	pP5 = P5.get(i, j);
			__m256d*	pP6 = P6.get(i, j);
			__m256d*	pP7 = P7.get(i, j);
			C1.set(i, j, (*pP1) + (*pP2));
			C3.set(i, j, (*pP1) + (*pP4) + (*pP5) + (*pP7));
			C4.set(i, j, (*pP1) + (*pP4) + (*pP5) + (*pP3));
			C2.set(i, j, (*pP1) + (*pP4) + (*pP3) + (*pP6));
			//C1(i, j) = P1(i, j) + P2(i, j);
			//C3(i, j) = P1(i, j) + P4(i, j) + P5(i, j) + P7(i, j);
			//C4(i, j) = P1(i, j) + P4(i, j) + P5(i, j) + P3(i, j);
			//C2(i, j) = P1(i, j) + P3(i, j) + P4(i, j) + P6(i, j);
			pP1++;
			pP2++;
			pP3++;
			pP4++;
			pP5++;
			pP6++;
			pP7++;
		}
	}
}