/**
 * Adds the sum of every row of `mat` to the matching element of `vec`.
 *
 * The columns are split statically among the OpenMP threads. Each thread
 * sums its share of the columns into a private buffer of `mat.nrow`
 * elements and then merges that buffer into `vec` inside a critical section.
 *
 * Note that `vec` is accumulated into (`+=`), not overwritten: whatever it
 * holds on entry is kept and the row sums are added on top.
 * NOTE(review): callers presumably pass a zero-filled vector - confirm.
 *
 * @param mat      matrix; `mat.colptr(col)` must give access to `mat.nrow`
 *                 consecutive elements of column `col`.
 * @param vec      destination, must satisfy `vec.len == mat.nrow`.
 * @param nthreads requested number of threads; values below 1 are treated as 1.
 * @throws std::invalid_argument if `vec.len` differs from `mat.nrow`.
 */
static void rowSums(TMat<TNumMat> mat, Vec<TNumVec> vec, int nthreads){
    if (mat.nrow != vec.len) throw std::invalid_argument("provided vector has invalid length");

    int nrow = mat.nrow;
    int ncol = mat.ncol;

    #pragma omp parallel num_threads(std::max(1, nthreads))
    {
        // Thread-private partial row sums, so the hot loop needs no locking.
        std::vector<TNumVec> partial(nrow, 0);
        TNumVec* partialData = partial.data();

        // `nowait`: a thread that has finished its columns goes straight to
        // the merge below instead of waiting for the other threads.
        #pragma omp for schedule(static) nowait
        for (int col = 0; col < ncol; ++col){
            TNumMat* colData = mat.colptr(col);
            // Candidate for unrolling (as noted by the original author).
            for (int row = 0; row < nrow; ++row){
                partialData[row] += colData[row];
            }
        }

        // Merge one thread at a time; `vec` is shared between threads.
        #pragma omp critical
        {
            for (int row = 0; row < nrow; ++row){
                vec[row] += partial[row];
            }
        }
    }
}
/**
 * Computes the sum of every column of `mat` and writes it to `vec`.
 *
 * Columns are split statically among the OpenMP threads. Each column is
 * summed independently and its total is stored in `vec.ptr[col]`, replacing
 * whatever was there before.
 *
 * @param mat      matrix; `mat.colptr(col)` must give access to `mat.nrow`
 *                 consecutive elements of column `col`.
 * @param vec      destination, must satisfy `vec.len == mat.ncol`.
 * @param nthreads requested number of threads; values below 1 are treated as 1.
 * @throws std::invalid_argument if `vec.len` differs from `mat.ncol`.
 */
static void colSums(TMat<TNumMat> mat, Vec<TNumVec> vec, int nthreads){
    if (mat.ncol != vec.len) throw std::invalid_argument("provided vector has invalid length");

    // Accumulate in the type that the usual arithmetic conversions produce
    // for the two element types. Summing in TNumMat alone overflows (or
    // loses precision) when the matrix type is narrower than the output
    // type, e.g. an int matrix summed into a double or long vector. When
    // both types are the same this is the same type as before (or a wider
    // one after integral promotion).
    typedef decltype(TNumMat() + TNumVec()) TAcc;

    TNumVec* cs = vec.ptr;
    const int nrow = mat.nrow;
    const int ncol = mat.ncol;

    #pragma omp parallel for schedule(static) num_threads(std::max(1, nthreads))
    for (int col = 0; col < ncol; ++col){
        const TNumMat* ptr = mat.colptr(col);
        TAcc tmp = 0;
        for (int row = 0; row < nrow; ++row){
            tmp += ptr[row];
        }
        // Each iteration writes a distinct element, so no synchronisation is needed.
        cs[col] = static_cast<TNumVec>(tmp);
    }
}