/**
 * Multiplies two square matrices of equal size, consuming the summation index
 * in groups of four.
 *
 * For every output row the summation index is first processed in complete
 * groups of `vec` entries; the remaining `dimension % vec` entries are handled
 * by a plain scalar loop. Each output element receives its products in
 * ascending summation order.
 *
 * @param matrixA left operand, dimension x dimension
 * @param matrixB right operand, dimension x dimension
 * @return the product matrixA * matrixB (an empty matrix for empty input)
 */
matrix_t matrixMulUnrolled(const matrix_t & matrixA, const matrix_t & matrixB) {
    using index_t = decltype(matrixA.size());

    const index_t dimension = matrixA.size();
    assert(matrixB.size() == dimension);
    // Only inspect the first row when there is one.
    assert(dimension == 0 || matrixA[0].size() == dimension);
    assert(dimension == 0 || matrixB[0].size() == dimension);

    matrix_t matrixC(dimension, typename matrix_t::value_type(dimension, 0)); // zero-initialised result

    const index_t vec{4};
    // First summation index that does not belong to a complete group of `vec`.
    const index_t start = dimension - dimension % vec;

    for (index_t x = 0; x < dimension; ++x) {
        // Complete groups. The bound must be `start`: stopping at
        // `dimension - vec` would drop the last group whenever dimension is a
        // multiple of vec, because the scalar loop below begins at `start`.
        for (index_t i = 0; i < start; i += vec)
            for (index_t y = 0; y < dimension; ++y)
                for (index_t t = 0; t < vec; ++t)
                    matrixC[x][y] += matrixA[x][i + t] * matrixB[i + t][y];

        // Leftover summation indices that did not fill a group.
        for (index_t i = start; i < dimension; ++i)
            for (index_t y = 0; y < dimension; ++y)
                matrixC[x][y] += matrixA[x][i] * matrixB[i][y];
    }

    return matrixC;
}
/**
 * Accumulates the product A * B onto C using 8-wide AVX vectors.
 *
 * The function adds to whatever C already holds; it does not clear C first.
 * `data` is used as an array of row pointers, and all three matrices are
 * indexed as n x n with n = A.size(). The outer row loop is distributed over
 * 8 OpenMP threads; each iteration writes only row i of C.
 *
 * @param A left operand (read only)
 * @param B right operand (read only)
 * @param C accumulator; receives C += A * B
 */
void multi2(const matrix_t &A, const matrix_t &B, matrix_t &C) {
    const size_t n = A.size();
    float **__restrict__ const da = A.data;
    float **__restrict__ const db = B.data;
    float **__restrict__ dc = C.data;

    const size_t chunk_size = 8;                    // floats per __m256
    const size_t chunks = n / chunk_size;           // complete vectors per row
    const size_t vector_end = chunk_size * chunks;  // first column handled by the scalar tail

#pragma omp parallel for num_threads(8)
    for (size_t i = 0; i < n; ++i) {
        for (size_t k = 0; k < n; ++k) {
            const float c = da[i][k];
            const __m256 a_line = _mm256_set1_ps(c);  // broadcast A[i][k] to all 8 lanes

            // Unaligned loads/stores: nothing here guarantees that the rows
            // start on a 32-byte boundary, and the aligned forms fault if not.
            for (size_t j = 0; j < vector_end; j += chunk_size) {
                const __m256 b_line = _mm256_loadu_ps(&db[k][j]);
                const __m256 c_line = _mm256_loadu_ps(&dc[i][j]);
                const __m256 r_line = _mm256_add_ps(_mm256_mul_ps(a_line, b_line), c_line);
                _mm256_storeu_ps(&dc[i][j], r_line);
            }

            // Scalar tail for the n % 8 columns that do not fill a vector.
            for (size_t j = vector_end; j < n; ++j) {
                dc[i][j] += c * db[k][j];
            }
        }
    }
}
/**
 * Searches a character grid for `needle`, matching one character per cell.
 *
 * Starting at (x, y), the character needle[start_index] must equal the cell;
 * the following characters are then looked for at the eight positions given
 * by the offset tables search_x / search_y (defined elsewhere; presumably the
 * eight neighbouring directions). A cell that is part of the current attempt
 * is temporarily overwritten with '.' so it is not reused. It is restored
 * when the attempt fails; on success the cells of the matched path are left
 * as '.'.
 *
 * The column bound is taken from the first row, so all rows are assumed to be
 * at least that long.
 *
 * @param matrix      grid to search; modified as described above
 * @param x           row of the cell to try
 * @param y           column of the cell to try
 * @param needle      text to find
 * @param start_index index of the next needle character to match
 * @param scan        when true and no match starts at (x, y), continue with
 *                    the following cells in row-major order
 * @return true when the remaining part of needle was matched
 */
bool search(matrix_t &matrix, int x, int y, string needle, int start_index, bool scan) {
    // All characters matched. This has to be decided before the bounds checks:
    // after the last character matches, the next step may point outside the
    // grid (always the case on a 1x1 grid), which must not turn a complete
    // match into a failure.
    if (needle.size() == static_cast<string::size_type>(start_index)) {
        return true;
    }

    // An empty grid has no first row to take the column count from.
    if (matrix.size() == 0) return false;

    const int x_max = static_cast<int>(matrix.size()) - 1;
    const int y_max = static_cast<int>(matrix[0].size()) - 1;
    if (x < 0 || x > x_max) return false;
    if (y < 0 || y > y_max) return false;

    if (matrix[x][y] == needle[start_index]) {
        const int original_char = matrix[x][y];
        matrix[x][y] = '.';  // mark the cell as in use for this attempt
        for (int i = 0; i < 8; ++i) {
            const bool found = search(matrix, x + search_x[i], y + search_y[i],
                                      needle, start_index + 1, false);
            if (found) return true;
        }
        matrix[x][y] = original_char;  // backtrack
    }

    if (scan) {
        if (y < y_max) {
            return search(matrix, x, y + 1, needle, start_index, true);
        } else if (x < x_max) {
            return search(matrix, x + 1, 0, needle, start_index, true);
        }
    }
    return false;
}
/**
 * Straightforward product of two square matrices of equal size.
 *
 * The loops run row / summation index / column, so the innermost loop walks
 * one row of the result and one row of matrixB.
 *
 * @param matrixA left operand, size x size
 * @param matrixB right operand, size x size
 * @return the product matrixA * matrixB
 */
matrix_t matrixMul(const matrix_t & matrixA, const matrix_t & matrixB) {
    using index_t = decltype(matrixA.size());

    const index_t size = matrixA.size();
    assert(matrixA[0].size() == size);
    assert(matrixB.size() == size);
    assert(matrixB[0].size() == size);

    // Result starts out filled with zeros.
    matrix_t product(size, typename matrix_t::value_type(size, 0));

    for (index_t row = 0; row < size; ++row) {
        for (index_t inner = 0; inner < size; ++inner) {
            for (index_t col = 0; col < size; ++col) {
                product[row][col] += matrixA[row][inner] * matrixB[inner][col];
            }
        }
    }

    return product;
}
/**
 * Product of two square matrices of equal size, computed in 8 x 8 x 8 tiles.
 *
 * The largest leading part whose side is a multiple of the tile size is
 * processed tile by tile. The rest is filled in by three scalar passes: the
 * leftover summation indices for the tiled part, the leftover rows, and the
 * leftover columns of the rows already covered by tiles.
 *
 * @param matrixA left operand, dimension x dimension
 * @param matrixB right operand, dimension x dimension
 * @return the product matrixA * matrixB
 */
matrix_t matrixMulTiled(const matrix_t & matrixA, const matrix_t & matrixB) {
    const size_t dimension = matrixA.size();
    assert(matrixA[0].size() == dimension);
    assert(matrixB.size() == dimension);
    assert(matrixB[0].size() == dimension);

    matrix_t result(dimension, typename matrix_t::value_type(dimension, 0)); // all zeros

    const size_t tile = 8;
    // Side length of the part covered by complete tiles; 0 when dimension < tile.
    // It is also where every "leftover" range begins.
    const size_t tiled = dimension - dimension % tile;

    // Pass 1: complete tiles.
    for (size_t rowBase = 0; rowBase < tiled; rowBase += tile) {
        for (size_t colBase = 0; colBase < tiled; colBase += tile) {
            for (size_t innerBase = 0; innerBase < tiled; innerBase += tile) {
                for (size_t r = rowBase; r < rowBase + tile; ++r) {
                    for (size_t c = colBase; c < colBase + tile; ++c) {
                        for (size_t k = innerBase; k < innerBase + tile; ++k) {
                            result[r][c] += matrixA[r][k] * matrixB[k][c];
                        }
                    }
                }
            }
        }
    }

    // Pass 2: leftover summation indices for the elements handled by tiles.
    for (size_t r = 0; r < tiled; ++r) {
        for (size_t k = tiled; k < dimension; ++k) {
            for (size_t c = 0; c < tiled; ++c) {
                result[r][c] += matrixA[r][k] * matrixB[k][c];
            }
        }
    }

    // Pass 3: rows below the tiled part, computed completely.
    for (size_t r = tiled; r < dimension; ++r) {
        for (size_t k = 0; k < dimension; ++k) {
            for (size_t c = 0; c < dimension; ++c) {
                result[r][c] += matrixA[r][k] * matrixB[k][c];
            }
        }
    }

    // Pass 4: columns right of the tiled part, for the rows not done in pass 3.
    for (size_t r = 0; r < tiled; ++r) {
        for (size_t k = 0; k < dimension; ++k) {
            for (size_t c = tiled; c < dimension; ++c) {
                result[r][c] += matrixA[r][k] * matrixB[k][c];
            }
        }
    }

    return result;
}
void Print(const matrix_t& m) { for (unsigned i = 0; i < m.size(); i++) { for (unsigned j = 0; j < m[i].size(); j++) { cout << m[i][j] << " "; } cout << endl; } }
/**
 * Performs one sweep over the interior cells of a square grid.
 *
 * Every interior cell of the returned copy is increased by a quarter of the
 * sum of its four direct neighbours. The neighbour values are always read
 * from the unmodified input grid, and the border cells are copied unchanged.
 *
 * NOTE(review): despite the name, already-updated cells are not fed back into
 * the same sweep, and the value is added to the cell rather than replacing
 * it. Confirm this is the intended scheme.
 *
 * @param grid square input grid
 * @return updated copy of grid; grids with fewer than three rows have no
 *         interior and are returned unchanged
 */
matrix_t gaussSeidelIteration(const matrix_t & grid) {
    using index_t = decltype(grid.size());

    const index_t dimension = grid.size();
    // Only inspect the first row when there is one.
    assert(dimension == 0 || grid[0].size() == dimension);

    matrix_t gridCopy = grid;

    // `x + 1 < dimension` instead of `x < dimension - 1`: the size is
    // unsigned, so subtracting from a zero dimension would wrap around.
    for (index_t x = 1; x + 1 < dimension; ++x)
        for (index_t y = 1; y + 1 < dimension; ++y)
            gridCopy[x][y] += 0.25 * (grid[x-1][y] + grid[x][y-1] + grid[x][y+1] + grid[x+1][y]);

    return gridCopy;
}
// Transpose of a matrix void transposeMatrix(matrix_t & M) { int rM = M.size(); int cM = M[1].size(); matrix_t tM; sizeMatrix(tM,cM,rM); for (int r=0; r<cM; r++) { for (int c=0; c<rM; c++) { tM[r][c] = M[c][r]; } } M = tM; }
/**
 * Computes res = m1 * m2 for an (m x n) matrix m1 and an (n x p) matrix m2.
 *
 * res is resized to m x p and completely overwritten. Because res is resized
 * and written while m1 and m2 are still being read, it must be a different
 * object from both operands.
 *
 * @param m1  left operand, m x n, non-empty
 * @param m2  right operand, n x p, non-empty
 * @param res receives the m x p product
 */
void Multiply(const matrix_t& m1, const matrix_t& m2, matrix_t& res) {
    assert(m1.size() > 0 && m2.size() > 0 && m1[0].size() == m2.size());

    const int m = m1.size();     // rows of m1 and of the result
    const int n = m2.size();     // shared inner dimension
    const int p = m2[0].size();  // columns of m2 and of the result

    res.resize(m);
    for (int i = 0; i < m; i++) {
        res[i].resize(p);
        // j walks the p result columns and k the n summation terms. With the
        // two bounds exchanged, any non-square product indexes out of range.
        for (int j = 0; j < p; j++) {
            res[i][j] = 0;
            for (int k = 0; k < n; k++) {
                res[i][j] += m1[i][k] * m2[k][j];
            }
        }
    }
}
// In-place Gauss-Jordan elimination with full pivoting.
//
// `a` is accessed as an n x n matrix (n = a.size()); `b` is accessed as n rows
// of m = b[0].size() columns and receives the same row operations as `a`.
// Both arguments are overwritten.
//
// NOTE(review): the structure matches the classic Numerical Recipes `gaussj`
// routine, where `a` ends up holding its inverse and `b` the solution
// vectors - confirm against the callers.
//
// SWAP, error and vector_t are defined elsewhere. error() is called when a
// pivot is exactly 0.0; whether it returns is not visible from here.
void gaussj(matrix_t & a, matrix_t & b) {
  int i,icol,irow,j,k,l,ll;
  double big,dum,pivinv;
  int n=a.size();
  int m=b[0].size();
  // indxr[i] / indxc[i] record the row and column of the i-th pivot;
  // ipiv[k] counts how often index k has been chosen as pivot.
  vector_t indxc(n),indxr(n),ipiv(n);
  for (j=0;j<n;j++) ipiv[j]=0;
  // One pass per pivot.
  for (i=0;i<n;i++) {
    // Pivot search: largest absolute value among the entries whose row is not
    // marked as used (ipiv[j] != 1) and whose column is unused (ipiv[k] == 0).
    // irow/icol are only assigned in here; if no entry satisfies the
    // comparison (e.g. NaN entries) they keep whatever they held before.
    big=0.0;
    for (j=0;j<n;j++)
      if (ipiv[j] != 1)
        for (k=0;k<n;k++) {
          if (ipiv[k] == 0) {
            if (fabs(a[j][k]) >= big) {
              big=fabs(a[j][k]);
              irow=j;
              icol=k;
            }
          }
        }
    // Mark the pivot column as used.
    ++(ipiv[icol]);
    // Move the pivot onto the diagonal by exchanging rows irow and icol in
    // both a and b.
    if (irow != icol) {
      for (l=0;l<n;l++) SWAP(a[irow][l],a[icol][l]);
      for (l=0;l<m;l++) SWAP(b[irow][l],b[icol][l]);
    }
    // Remember the exchange so it can be undone on the columns at the end.
    indxr[i]=irow;
    indxc[i]=icol;
    if (a[icol][icol] == 0.0) error("gaussj: Singular Matrix");
    // Scale the pivot row by 1/pivot. The pivot entry is set to 1.0 first, so
    // after scaling it holds 1/pivot.
    pivinv=1.0/a[icol][icol];
    a[icol][icol]=1.0;
    for (l=0;l<n;l++) a[icol][l] *= pivinv;
    for (l=0;l<m;l++) b[icol][l] *= pivinv;
    // Eliminate the pivot column from every other row. The column entry is
    // saved in dum and zeroed before the pivot row is subtracted.
    for (ll=0;ll<n;ll++)
      if (ll != icol) {
        dum=a[ll][icol];
        a[ll][icol]=0.0;
        for (l=0;l<n;l++) a[ll][l] -= a[icol][l]*dum;
        for (l=0;l<m;l++) b[ll][l] -= b[icol][l]*dum;
      }
  }
  // Undo the recorded exchanges on the columns of a, in reverse order of the
  // pivots.
  for (l=n-1;l>=0;l--) {
    if (indxr[l] != indxc[l])
      for (k=0;k<n;k++)
        SWAP(a[k][(int)indxr[l]],a[k][(int)indxc[l]]);
  }
}
/**
 * Adds the product A * B onto C with plain scalar loops.
 *
 * C is not cleared first; the products are accumulated onto its current
 * contents. `data` is used as an array of row pointers and all matrices are
 * indexed as dim x dim with dim = A.size(). The row loop is distributed over
 * 8 OpenMP threads, each iteration writing only its own row of C.
 *
 * @param A left operand (read only)
 * @param B right operand (read only)
 * @param C accumulator; receives C += A * B
 */
void multi1(const matrix_t &A, const matrix_t &B, matrix_t &C) {
    const size_t dim = A.size();
    float **__restrict__ const lhs = A.data;
    float **__restrict__ const rhs = B.data;
    float **__restrict__ out = C.data;

#pragma omp parallel for num_threads(8)
    for (size_t row = 0; row < dim; ++row) {
        for (size_t inner = 0; inner < dim; ++inner) {
            // One entry of A scales a whole row of B.
            const float scale = lhs[row][inner];
            size_t col = 0;
            while (col < dim) {
                out[row][col] += scale * rhs[inner][col];
                ++col;
            }
        }
    }
}
/**
 * Fills A and B with normally distributed pseudo-random values.
 *
 * The values come from a Mersenne Twister engine seeded with `seed` and a
 * default-constructed normal distribution. They are drawn alternately for A
 * and B while walking the elements in row-major order. Both matrices are
 * indexed as dim x dim with dim = A.size().
 *
 * @param A    first matrix to fill
 * @param B    second matrix to fill; indexed with the size of A
 * @param seed seed for the engine (and for srand)
 */
void generate_matrix(matrix_t &A, matrix_t &B, unsigned int seed) {
    // Also reseeds the C library generator; rand() itself is not used here.
    srand(seed);

    mt19937 engine(seed);
    normal_distribution<> gaussian;

    float **rowsA = A.data;
    float **rowsB = B.data;
    const size_t dim = A.size();

    for (size_t r = 0; r < dim; ++r) {
        for (size_t c = 0; c < dim; ++c) {
            // Keep this order: the value for A is drawn before the one for B.
            rowsA[r][c] = gaussian(engine);
            rowsB[r][c] = gaussian(engine);
        }
    }
}
/**
 * Runs `method(A, b)` `iter` times and summarises the first column of its
 * results.
 *
 * For each of the n = b.size() rows the function returns the mean over all
 * runs and the sample standard deviation (divisor iter - 1) divided by
 * sqrt(iter). A NaN result is recorded as 0. Only column 0 of the returned
 * matrices is written; they are created by make_matrix (defined elsewhere)
 * and accumulated into, so make_matrix is presumably zero-initialising.
 *
 * The runs are distributed over OpenMP threads when OpenMP is enabled.
 *
 * NOTE(review): `method` receives A and b by non-const reference and all
 * threads share the same two objects. If `method` modifies its arguments the
 * runs are not independent - confirm against the methods passed in.
 *
 * @param method solver to evaluate
 * @param A      first argument handed to method
 * @param b      second argument handed to method; must have at least one row
 * @param iter   number of runs; with iter == 1 the deviation divides by zero
 * @return pair (average, deviation)
 */
template<typename T>
std::pair<matrix_t<T>, matrix_t<T>> stat_analysis (std::function<matrix_t<T>(matrix_t<T>&, matrix_t<T>&)> method, matrix_t<T> A, matrix_t<T> b, int iter)
{
    const size_t n = b.size();
    const size_t runs = static_cast<size_t>(iter);

    matrix_t<T> average = make_matrix<T>(n, b[0].size());
    matrix_t<T> results = make_matrix<T>(n, iter);   // results[row][run]
    matrix_t<T> deviation = make_matrix<T>(n, b[0].size());

#if defined(_OPENMP)
#pragma omp parallel for
#endif
    for (size_t i = 0 ; i < runs ; i++)
    {
        // The result lives inside the loop body so every thread works on its
        // own object. A single matrix shared by all iterations is written
        // concurrently by the threads of the parallel loop.
        matrix_t<T> temp = method (A, b);
        for (size_t j = 0 ; j < n ; j++)
        {
            if (isnan(temp[j][0])) temp[j][0] = 0.;
            results[j][i] = temp[j][0];   // each run writes its own column
        }
    }

    for (size_t i = 0 ; i < n ; i++)
    {
        for (size_t j = 0 ; j < runs ; j++)
        {
            average[i][0] += results[i][j];
        }
        average[i][0] /= iter;

        for (size_t j = 0 ; j < runs ; j++)
        {
            deviation[i][0] += (results[i][j] - average[i][0]) * (results[i][j] - average[i][0]);
        }
        deviation[i][0] /= (static_cast<T>(iter - 1));
        deviation[i][0] = sqrt(deviation[i][0])/sqrt(iter);
    }

    return std::make_pair (std::move(average), std::move(deviation));
}
/**
 * Repeatedly combines every entry (from, to) with the two-step value
 * (from, via) + (via, to), for each intermediate index `via` in turn.
 *
 * The combination is the reciprocal of the sum of reciprocals,
 * 1 / (1/direct + 1/detour), the same form as two resistances in parallel.
 * Special cases: if the entry or the detour is 0 the entry becomes 0; if the
 * reciprocals cancel to 0 the entry becomes infinity. Updates are made in
 * place on a copy of M, so later steps see earlier results.
 *
 * @param M square input matrix (indexed as M.size() x M.size())
 * @return the combined matrix
 */
static matrix_t dot(const matrix_t& M) {
    matrix_t combined = M;
    const size_t count = M.size();

    for (unsigned int via = 0; via < count; via++) {
        for (unsigned int from = 0; from < count; from++) {
            for (unsigned int to = 0; to < count; to++) {
                // Read everything before the entry is overwritten.
                const auto direct = combined[from][to];
                const auto detour = combined[from][via] + combined[via][to];

                if (direct == 0 || detour == 0) {
                    combined[from][to] = 0;
                    continue;
                }

                const auto reciprocalSum = 1/direct + 1/detour;
                if (reciprocalSum == 0) {
                    combined[from][to] = std::numeric_limits<double>::infinity();
                } else {
                    combined[from][to] = 1/reciprocalSum;
                }
            }
        }
    }

    return combined;
}
/**
 * Multiplies two square matrices of equal size using 8-wide AVX fused
 * multiply-add.
 *
 * For every pair (row x, summation index i) the scalar matrixA[x][i] is
 * broadcast and applied to eight columns of row i of matrixB at a time:
 * matrixC[x][y..y+7] += matrixA[x][i] * matrixB[i][y..y+7]. The
 * `dimension % 8` columns that do not fill a vector are handled by a scalar
 * loop. The intrinsics operate on `float`, so the matrix elements must be
 * floats stored contiguously per row.
 *
 * @param matrixA left operand, dimension x dimension
 * @param matrixB right operand, dimension x dimension
 * @return the product matrixA * matrixB (an empty matrix for empty input)
 */
matrix_t matrixMulFMA(const matrix_t & matrixA, const matrix_t & matrixB) {
    using index_t = decltype(matrixA.size());

    const index_t dimension = matrixA.size();
    assert(matrixB.size() == dimension);
    // Only inspect the first row when there is one.
    assert(dimension == 0 || matrixA[0].size() == dimension);
    assert(dimension == 0 || matrixB[0].size() == dimension);

    matrix_t matrixC(dimension, typename matrix_t::value_type(dimension, 0)); // zero-initialised result

    const index_t vec = 8;  // floats per __m256
    // First column that is not part of a complete group of `vec` columns.
    const index_t start = dimension - dimension % vec;

    for (index_t x = 0; x < dimension; ++x) {
        for (index_t i = 0; i < dimension; ++i) {
            const __m256 a = _mm256_set1_ps(matrixA[x][i]);  // broadcast to all lanes

            // Complete vectors. The bound must be `start`: stopping at
            // `dimension - vec` would leave the last eight columns untouched
            // whenever dimension is a multiple of vec, since the scalar loop
            // below begins at `start`.
            for (index_t y = 0; y < start; y += vec) {
                // Unaligned accesses: the row storage is not guaranteed to be
                // 32-byte aligned.
                const __m256 b = _mm256_loadu_ps(&matrixB[i][y]);
                const __m256 c = _mm256_loadu_ps(&matrixC[x][y]);
                _mm256_storeu_ps(&matrixC[x][y], _mm256_fmadd_ps(a, b, c));  // c = a * b + c
            }

            // Remaining columns.
            for (index_t y = start; y < dimension; ++y)
                matrixC[x][y] += matrixA[x][i] * matrixB[i][y];
        }
    }

    return matrixC;
}