/// Overwrite selected columns of A with the leading columns of B.
///
/// For each c in [0, num_cols), column c of B is copied over column
/// col_indices[c] of A. The copy goes through the raw buffers with memcpy,
/// using LDim() as the column stride, so T must be safe to copy bytewise.
///
/// All arguments are validated before anything is written, so a failed call
/// leaves A untouched.
///
/// @param A            destination matrix
/// @param B            source matrix; same height as A, at least num_cols columns
/// @param col_indices  destination column in A for each source column of B
/// @param num_cols     number of columns to copy
/// @throws std::logic_error on a height mismatch, if num_cols exceeds the
///         width of A or B or the number of supplied indices, or if any
///         used index is not a valid column of A
void OverwriteCols(DenseMatrix<T>& A,
                   const DenseMatrix<T>& B,
                   const std::vector<unsigned int>& col_indices,
                   const unsigned int num_cols)
{
    const unsigned int height  = A.Height();
    const unsigned int width_a = static_cast<unsigned int>(A.Width());

    if (B.Height() != A.Height())
        throw std::logic_error("OverwriteCols: height mismatch");

    if (num_cols > width_a)
        throw std::logic_error("OverwriteCols: col indices out of bounds");

    if (num_cols > col_indices.size())
        throw std::logic_error("OverwriteCols: too few col indices");

    if (num_cols > static_cast<unsigned int>(B.Width()))
        throw std::logic_error("OverwriteCols: source matrix has too few columns");

    // Check every destination index up front so that A is never left
    // partially overwritten.
    for (unsigned int c=0; c<num_cols; ++c)
    {
        if (col_indices[c] >= width_a)
            throw std::logic_error("OverwriteCols: col indices out of bounds");
    }

    // nothing to copy
    if ( (0 == height) || (0 == num_cols))
        return;

    T* buf_a = A.Buffer();
    const std::size_t ldim_a = A.LDim();

    const T* buf_b = B.LockedBuffer();
    const std::size_t ldim_b = B.LDim();

    const std::size_t bytes_per_col = static_cast<std::size_t>(height) * sizeof(T);

    for (unsigned int c=0; c<num_cols; ++c)
    {
        // offsets are computed in size_t so that they cannot wrap at 2^32
        const std::size_t offset_a = static_cast<std::size_t>(col_indices[c]) * ldim_a;
        const std::size_t offset_b = static_cast<std::size_t>(c) * ldim_b;
        memcpy(&buf_a[offset_a], &buf_b[offset_b], bytes_per_col);
    }
}
// Solve (L D L^T) p = g for p.
//
// l holds the triangular factor; only its strictly lower part is read, so
// its diagonal is implicitly treated as 1. d holds the diagonal of D.
// The right-hand side g is copied into p and solved in place.
void SolveLDLt (const DenseMatrix & l, const Vector & d, const Vector & g, Vector & p)
{
  const int n = l.Height();

  p = g;

  // forward substitution: solve L y = g
  for (int row = 0; row < n; row++)
    {
      double sum = 0;
      for (int col = 0; col < row; col++)
        sum += p(col) * l(row, col);
      p(row) -= sum;
    }

  // diagonal solve: z = D^{-1} y
  for (int row = 0; row < n; row++)
    p(row) /= d(row);

  // back substitution: solve L^T p = z
  int row = n;
  while (row-- > 0)
    {
      double sum = 0;
      for (int col = row + 1; col < n; col++)
        sum += p(col) * l(col, row);
      p(row) -= sum;
    }
}
/// Construct a constant matrix coefficient times a scalar Coefficient MatrixFunctionCoefficient(const DenseMatrix &m, Coefficient &q) : MatrixCoefficient(m.Height(), m.Width()), Q(&q) { Function = NULL; TDFunction = NULL; mat = m; }
// ------------------------------------------------------------------------ // // Init function - call prior to start of iterations // // ------------------------------------------------------------------------ void Init(const Matrix<T>& A, DenseMatrix<T>& W, DenseMatrix<T>& H) { WtW.Resize(W.Width(), W.Width()); WtA.Resize(W.Width(), A.Width()); HHt.Resize(H.Height(), H.Height()); AHt.Resize(A.Height(), H.Height()); ScaleFactors.Resize(H.Height(), 1); // compute the kxk matrix WtW = W' * W Gemm(TRANSPOSE, NORMAL, T(1), W, W, T(0), WtW); // compute the kxn matrix WtA = W' * A Gemm(TRANSPOSE, NORMAL, T(1), W, A, T(0), WtA); }
void TopTerms(const int maxterms, const DenseMatrix<T>& V, // column vector (single col of W) std::vector<int>& sort_indices, std::vector<int>& term_indices) { // Sort the row indices for topic vector V into decreasing order. // Compare data elements to rearrange the indices. int height = V.Height(); if (sort_indices.size() < static_cast<unsigned int>(height)) throw std::runtime_error("TopTerms: index array too small"); if (term_indices.size() < static_cast<unsigned int>(maxterms)) throw std::runtime_error("TopTerms: term array too small"); const T* data = V.LockedBuffer(); // initialize the indices for the sort for (int q=0; q<height; ++q) sort_indices[q] = q; std::sort(&sort_indices[0], &sort_indices[0] + height, [&data](int i1, int i2) {return data[i1] > data[i2];}); size_t max_terms = std::min(maxterms, height); for (size_t q=0; q<max_terms; ++q) { int index = sort_indices[q]; assert(index >= 0); assert(index < height); term_indices[q] = index; } }
bool SolveNormalEq(const DenseMatrix<T>& LHS, // kxk const DenseMatrix<T>& RHS, // kxn DenseMatrix<T>& X) // kxn { // Solve LHS * X = RHS for X, where LHS is assumed SPD. if (LHS.Width() != LHS.Height()) throw std::logic_error("SolveNormalEq: expected square matrix on LHS"); if ( (X.Height() != LHS.Height()) || (X.Width() != RHS.Width())) throw std::logic_error("SolveNormalEq: non-conformant matrix X"); // copy the input, since the solver overwrites it DenseMatrix<T> M(LHS); X = RHS; return SolveNormalEq(M, X); }
void IsoparametricTransformation::Transform (const DenseMatrix &matrix, DenseMatrix &result) { MFEM_ASSERT(matrix.Height() == GetDimension(), "invalid input"); result.SetSize(PointMat.Height(), matrix.Width()); IntegrationPoint ip; Vector col; for (int j = 0; j < matrix.Width(); j++) { ip.Set(matrix.GetColumn(j), matrix.Height()); result.GetColumnReference(j, col); Transform(ip, col); } }
void Cholesky (const DenseMatrix & a, DenseMatrix & l, Vector & d) { // Factors A = L D L^T double x; int i, j, k; int n = a.Height(); // (*testout) << "a = " << a << endl; l = a; for (i = 1; i <= n; i++) { for (j = i; j <= n; j++) { x = l.Get(i, j); for (k = 1; k < i; k++) x -= l.Get(i, k) * l.Get(j, k) * d.Get(k); if (i == j) { d.Elem(i) = x; } else { l.Elem(j, i) = x / d.Get(k); } } } for (i = 1; i <= n; i++) { l.Elem(i, i) = 1; for (j = i+1; j <= n; j++) l.Elem(i, j) = 0; } /* // Multiply: (*testout) << "multiplied factors: " << endl; for (i = 1; i <= n; i++) for (j = 1; j <= n; j++) { x = 0; for (k = 1; k <= n; k++) x += l.Get(i, k) * l.Get(j, k) * d.Get(k); (*testout) << x << " "; } (*testout) << endl; */ }
// Compute p = L D L^T g.
//
// g is copied into p and the three factors are applied in place, right to
// left. Only the lower triangle of l (diagonal included) is read.
void MultLDLt (const DenseMatrix & l, const Vector & d, const Vector & g, Vector & p)
{
  const int n = l.Height();

  p = g;

  // p <- L^T p; entry 'row' only needs entries >= row, which are still
  // unmodified when sweeping upwards in index
  for (int row = 0; row < n; row++)
    {
      double sum = 0;
      for (int k = row; k < n; k++)
        sum += p(k) * l(k, row);
      p(row) = sum;
    }

  // p <- D p
  for (int row = 0; row < n; row++)
    p(row) *= d(row);

  // p <- L p; entry 'row' only needs entries <= row, so sweep downwards
  int row = n;
  while (row-- > 0)
    {
      double sum = 0;
      for (int k = 0; k <= row; k++)
        sum += p(k) * l(row, k);
      p(row) = sum;
    }
}
void OptimalActiveSetW(DenseMatrix<T>& W, // m x 2 const DenseMatrix<T>& HHt, // 2 x 2 const DenseMatrix<T>& AHt) // m x 2 { // Remove negative entries in each of the m rows of matrix W, but // do so in a manner that minimizes the overall objective function. // Each row can be considered in isolation. // // The problem for each row i of W, w = W(i,:), is as follows: // // min_{w>=0} |wH - a|^2 // // This is minimized when w'* = Ha'/(hh'), or w* = aH'/(hh'). // Expressed in terms of the individual elements, this becomes: // // w*[0] = ah'[0] / (h[0]h'[0]) // w*[1] = ah'[1] / (h[1]h'[1]) // If both elements of w* are nonnegative, this is the optimal solution. // If any elements are negative, the Rank2 algorithm is used to adjust // their values. int height = W.Height(); const T hht_00 = HHt.Get(0,0); const T hht_11 = HHt.Get(1,1); const T inv_hht_00 = T(1.0) / hht_00; const T inv_hht_11 = T(1.0) / hht_11; const T sqrt_hht_00 = sqrt(hht_00); const T sqrt_hht_11 = sqrt(hht_11); for (int i=0; i<height; ++i) { T v1 = AHt.Get(i, 0) * inv_hht_00; T v2 = AHt.Get(i, 1) * inv_hht_11; T vv1 = v1 * sqrt_hht_00; T vv2 = v2 * sqrt_hht_11; if (vv1 >= vv2) v2 = T(0); else v1 = T(0); if ( (W.Get(i, 0) <= T(0)) || (W.Get(i, 1) <= T(0))) { W.Set(i, 0, v1); W.Set(i, 1, v2); } } }
/// Replace each diagonal entry M(r,r) with (sum of row r) + 1.
///
/// The row sum is taken over the whole row, including the old diagonal
/// entry. The entries are expected to be nonnegative, which is why no
/// absolute values are taken; the new diagonal then exceeds the sum of the
/// off-diagonal entries, which keeps the matrix nonsingular.
void MakeDiagonallyDominant(DenseMatrix<T>& M)
{
    const int num_rows = M.Height();
    const int num_cols = M.Width();

    for (int row=0; row<num_rows; ++row)
    {
        T total = 0.0;

        int col = 0;
        while (col < num_cols)
        {
            total += M.Get(row, col);
            ++col;
        }

        M.Set(row, row, total + T(1));
    }
}
void IsoparametricTransformation::Transform (const DenseMatrix &matrix, DenseMatrix &result) { result.SetSize(PointMat.Height(), matrix.Width()); IntegrationPoint ip; Vector col; for (int j = 0; j < matrix.Width(); j++) { ip.x = matrix(0, j); if (matrix.Height() > 1) { ip.y = matrix(1, j); if (matrix.Height() > 2) { ip.z = matrix(2, j); } } result.GetColumnReference(j, col); Transform(ip, col); } }
bool HasNaNs(const DenseMatrix<T>& M) { // returns true if any matrix element is NaN int height = M.Height(); int width = M.Width(); for (int c=0; c<width; ++c) { for (int r=0; r<height; ++r) { T elt = M.Get(r, c); if (std::isnan(elt)) return true; } } return false; }
// helper to set submatrix of A repeated vdim times static void SetVDofSubMatrixTranspose(SparseMatrix& A, Array<int>& rows, Array<int>& cols, const DenseMatrix& subm, int vdim) { if (vdim == 1) { A.SetSubMatrixTranspose(rows, cols, subm, 1); } else { int nr = subm.Width(), nc = subm.Height(); for (int d = 0; d < vdim; d++) { Array<int> rows_sub(rows.GetData() + d*nr, nr); // (not owner) Array<int> cols_sub(cols.GetData() + d*nc, nc); // (not owner) A.SetSubMatrixTranspose(rows_sub, cols_sub, subm, 1); } } }
// Update the factors l and d in place using the scalar a and the vector u
// (indices are 1-based via Get/Elem).
// NOTE(review): this looks like a rank-one update of L D L^T by a * u * u^T
// - confirm against the callers.
//
// Note: any real value of a is allowed.
// Return value: 0 .. D remains positive definite
//               1 .. otherwise (the update is abandoned part-way; columns
//                    processed before the failure stay modified)
int LDLtUpdate (DenseMatrix & l, Vector & d, double a, const Vector & u)
{
  const int n = l.Height();

  // working copy of u, consumed column by column
  Vector v(n);
  v = u;

  double t_prev = 1;

  for (int col = 1; col <= n; col++)
    {
      const double t = t_prev + a * sqr (v.Elem(col)) / d.Get(col);

      if (t <= 0)
        {
          (*testout) << "update err, t = " << t << endl;
          return 1;
        }

      // xi must be formed from the old d(col), i.e. before d is rescaled
      const double xi = a * v.Elem(col) / (d.Get(col) * t);

      d.Elem(col) *= t / t_prev;

      for (int row = col + 1; row <= n; row++)
        {
          // order matters: l is corrected with the already-updated v(row)
          v.Elem(row) -= v.Elem(col) * l.Elem(row, col);
          l.Elem(row, col) += xi * v.Elem(row);
        }

      t_prev = t;
    }

  return 0;
}
/// Scatter the entries of B into A.
///
/// For r in [0, num_rows) and c in [0, num_cols), entry B(r, c) is written
/// to A(row_indices[r], col_indices[c]). Both matrices are accessed through
/// their raw buffers with LDim() as the column stride.
///
/// All arguments are validated before anything is written, so a failed call
/// leaves A untouched.
///
/// @param A            destination matrix
/// @param B            source matrix, at least num_rows x num_cols
/// @param row_indices  destination row in A for each row of B
/// @param col_indices  destination column in A for each column of B
/// @param num_rows     number of rows of B to copy
/// @param num_cols     number of columns of B to copy
/// @throws std::logic_error if the counts exceed the size of A, B or the
///         index arrays, or if any used index lies outside A
void Overwrite(DenseMatrix<T>& A,
               const DenseMatrix<T>& B,
               const std::vector<unsigned int>& row_indices,
               const std::vector<unsigned int>& col_indices,
               const unsigned int num_rows,
               const unsigned int num_cols)
{
    const unsigned int height_a = static_cast<unsigned int>(A.Height());
    const unsigned int width_a  = static_cast<unsigned int>(A.Width());

    if (num_rows > height_a)
        throw std::logic_error("Overwrite: row indices out of bounds");
    if (num_cols > width_a)
        throw std::logic_error("Overwrite: col indices out of bounds");

    if (num_rows > row_indices.size())
        throw std::logic_error("Overwrite: too few row indices");
    if (num_cols > col_indices.size())
        throw std::logic_error("Overwrite: too few col indices");

    if ( (num_rows > static_cast<unsigned int>(B.Height())) ||
         (num_cols > static_cast<unsigned int>(B.Width())))
        throw std::logic_error("Overwrite: source matrix is too small");

    // Check every destination index up front so that A is never left
    // partially overwritten.
    for (unsigned int r=0; r<num_rows; ++r)
    {
        if (row_indices[r] >= height_a)
            throw std::logic_error("Overwrite: row indices out of bounds");
    }
    for (unsigned int c=0; c<num_cols; ++c)
    {
        if (col_indices[c] >= width_a)
            throw std::logic_error("Overwrite: col indices out of bounds");
    }

    // nothing to copy
    if ( (0 == num_rows) || (0 == num_cols))
        return;

    const T* buf_b = B.LockedBuffer();
    const std::size_t ldim_b = B.LDim();

    T* buf_a = A.Buffer();
    const std::size_t ldim_a = A.LDim();

    for (unsigned int c=0; c<num_cols; ++c)
    {
        // offsets are computed in size_t so that they cannot wrap at 2^32
        const std::size_t offset_a = ldim_a * col_indices[c];
        const std::size_t offset_b = ldim_b * c;

        for (unsigned int r=0; r<num_rows; ++r)
            buf_a[offset_a + row_indices[r]] = buf_b[offset_b + r];
    }
}
/// Set A(i,j) to zero wherever |A(i,j)| < tol.
///
/// The matrix is processed through its raw buffer, one column per loop
/// iteration, with LDim() as the column stride. Columns are independent, so
/// the outer loop is handed to OPENMP_PRAGMA for parallelization.
///
/// @param A    matrix to clean up in place
/// @param tol  magnitude threshold; entries strictly below it are zeroed
void ZeroizeSmallValues(DenseMatrix<T>& A, const T tol)
{
    T* buf = A.Buffer();
    const std::size_t ldim = A.LDim();

    const unsigned int height = A.Height();
    const unsigned int width  = A.Width();

    OPENMP_PRAGMA(omp parallel for)
    for (unsigned int c=0; c<width; ++c)
    {
        // the offset is computed in size_t so that it cannot wrap at 2^32
        T* col = buf + static_cast<std::size_t>(c) * ldim;

        for (unsigned int r=0; r<height; ++r)
        {
            if (std::abs(col[r]) < tol)
                col[r] = T(0);
        }
    }
}
bool NnlsHals(const MatrixType<T>& A, DenseMatrix<T>& W, DenseMatrix<T>& H, const T tol, const bool verbose, const unsigned int max_iter) { unsigned int n = A.Width(); unsigned int k = W.Width(); if (static_cast<unsigned int>(W.Height()) != static_cast<unsigned int>(A.Height())) throw std::logic_error("NnlsHals: W and A must have identical height"); if (static_cast<unsigned int>(H.Width()) != static_cast<unsigned int>(A.Width())) throw std::logic_error("NnlsHals: H and A must have identical width"); if (H.Height() != W.Width()) throw std::logic_error("NnlsHals: non-conformant W and H"); DenseMatrix<T> WtW(k, k), WtA(k, n), WtWH_r(1, n), gradH(k, n); if (verbose) std::cout << "\nRunning NNLS solver..." << std::endl; // compute W'W and W'A for the normal equations Gemm(TRANSPOSE, NORMAL, T(1.0), W, W, T(0.0), WtW); Gemm(TRANSPOSE, NORMAL, T(1.0), W, A, T(0.0), WtA); bool success = false; T pg0 = T(0), pg; for (unsigned int i=0; i<max_iter; ++i) { // compute the new matrix H UpdateH_Hals(H, WtWH_r, WtW, WtA); // compute gradH = WtW*H - WtA Gemm(NORMAL, NORMAL, T(1.0), WtW, H, T(0.0), gradH); Axpy( T(-1.0), WtA, gradH); // compute progress metric if (0 == i) { pg0 = ProjectedGradientNorm(gradH, H); if (verbose) ReportProgress(i+1, T(1.0)); continue; } else { pg = ProjectedGradientNorm(gradH, H); } if (verbose) ReportProgress(i+1, pg/pg0); // check progress vs. desired tolerance if (pg < tol * pg0) { success = true; NormalizeAndScale<T>(W, H); break; } } if (!success) std::cerr << "NNLS solver reached iteration limit." << std::endl; return success; }
/// Build a matrix coefficient whose dimensions are taken from @a m and whose
/// member @a mat is initialized from @a m.
MatrixConstantCoefficient(const DenseMatrix &m)
   : MatrixCoefficient(m.Height(), m.Width()),
     mat(m)
{
   // all work is done in the initializer list
}
bool SystemSolveW(DenseMatrix<T>& X, // m x 2 DenseMatrix<T>& A, // 2 x 2 DenseMatrix<T>& B) // m x 2 { // Solve XA = B; use the code from the previous solver, but transpose // everything and change t to -t. const T eps = std::numeric_limits<double>::epsilon(); int m = B.Height(); if (std::abs(A.Get(0,0)) < eps && std::abs(A.Get(0,1)) < eps) { std::cerr << "SystemSolveW: singular matrix" << std::endl; return false; } T a2, b2, d2, e2, f2, x_1; T inv_a2, inv_d2; if (std::abs(A.Get(0,0)) >= std::abs(A.Get(0, 1))) { // use 'cosine' formulation; t is the tangent T t = A.Get(0, 1) / A.Get(0,0); a2 = A.Get(0,0) + t*A.Get(0,1); b2 = A.Get(1,0) + t*A.Get(1,1); d2 = A.Get(1,1) - t*A.Get(1,0); // precompute 1/a2 and 1/d2 to avoid repeated division inv_a2 = T(1.0) / a2; inv_d2 = T(1.0) / d2; // a2 is guaranteed to be positive if (std::abs(d2/a2) < eps) return false; // solve the upper triangular systems by backsubstitution for (int i=0; i<m; ++i) { e2 = B.Get(i,0) + t*B.Get(i,1); f2 = B.Get(i,1) - t*B.Get(i,0); x_1 = f2 * inv_d2; X.Set(i, 1, x_1); X.Set(i, 0, (e2 - b2*x_1)*inv_a2); } } else { // use 'sine' formulation; ct is the cotangent T ct = A.Get(0,0) / A.Get(0,1); a2 = -A.Get(0,1) - ct*A.Get(0,0); b2 = -A.Get(1,1) - ct*A.Get(1,0); d2 = A.Get(1,0) - ct*A.Get(1,1); // precompute 1/a2 and 1/d2 to avoid repeated division inv_a2 = T(1.0) / a2; inv_d2 = T(1.0) / d2; // a2 is guaranteed to be positive if (std::abs(d2/a2) < eps) return false; // solve the upper triangular systems by backsubstitution for (int i=0; i<m; ++i) { e2 = -B.Get(i,1) - ct*B.Get(i,0); f2 = B.Get(i,0) - ct*B.Get(i,1); x_1 = f2 * inv_d2; X.Set(i, 1, x_1); X.Set(i, 0, (e2 - b2*x_1) * inv_a2); } } return true; }
bool SolveNormalEqLeft(const DenseMatrix<T>& LHS, // kxk const DenseMatrix<T>& RHS, // mxk DenseMatrix<T>& X) // mxk { // Solve X * LHS = RHS for X, where LHS is symmetric positive-definite. if (LHS.Width() != LHS.Height()) throw std::logic_error("SolveNormalEqLeft: expected square matrix on LHS"); if ( (X.Width() != LHS.Height()) || (X.Height() != RHS.Height())) throw std::logic_error("SolveNormalEqLeft: non-conformant matrix X"); // // copy the input, since the solver overwrites it DenseMatrix<T> U(LHS); // Compute the Cholesky factor LHS = U'U bool success = true; try { Cholesky(UPPER, U); } catch (EL::NonHPSDMatrixException& e) { std::cerr << "Cholesky factorization failure - "; std::cerr << "matrix was not symmetric positive-definite." << std::endl; success = false; } catch (std::logic_error& e) { std::cerr << "Cholesky factorization failure - "; std::cerr << "matrix was not symmetric positive-definite." << std::endl; success = false; } if (!success) return false; // Solve X (U'U) = RHS as follows: // // Group the two left matrices: // // (XU') U = RHS // // Let Y = XU', so the equation becomes: // // YU = RHS // // Do a triangular solve: Y = (RHS) * inverse(U) X = RHS; Trsm(RIGHT, UPPER, NORMAL, NON_UNIT, T(1), U, X); // The solution Y is stored in X. // // Now solve XU' = Y, so X = Y * inverse(U') Trsm(RIGHT, UPPER, TRANSPOSE, NON_UNIT, T(1), U, X); // check // DenseMatrix<T> temp(m, k); // Gemm(NORMAL, NORMAL, T(1), X, LHS, T(0), temp); // Axpy(-1.0, RHS, temp); // double norm = Norm(temp, FROBENIUS_NORM); // std::cout << "\tSolveNormalEqLeft: norm = " << norm << std::endl; return true; }
bool NnlsBlockpivot(const DenseMatrix<T>& LHS, const DenseMatrix<T>& RHS, DenseMatrix<T>& X, // input as xinit DenseMatrix<T>& Y) // gradX { // Solve (LHS)*X = RHS for X by block principal pivoting. Matrix LHS // is assumed to be symmetric positive definite. const int PBAR = 3; const unsigned int WIDTH = RHS.Width(); const unsigned int HEIGHT = RHS.Height(); const unsigned int MAX_ITER = HEIGHT*5; BitMatrix passive_set = (X > T(0)); std::vector<unsigned int> tmp_indices(WIDTH); for (unsigned int i=0; i<WIDTH; ++i) tmp_indices[i] = i; MakeZeros(X); if (!BppSolveNormalEqNoGroup(tmp_indices, passive_set, LHS, RHS, X)) return false; // Y = LHS * X - RHS Gemm(NORMAL, NORMAL, T(1), LHS, X, T(0), Y); Axpy( T(-1), RHS, Y); std::vector<int> P(WIDTH, PBAR), Ninf(WIDTH, HEIGHT+1); BitMatrix nonopt_set = (Y < T(0)) & ~passive_set; BitMatrix infeas_set = (X < T(0)) & passive_set; std::vector<int> col_sums(WIDTH); std::vector<int> not_good(WIDTH); nonopt_set.SumColumns(not_good); infeas_set.SumColumns(col_sums); not_good += col_sums; BitMatrix not_opt_cols = (not_good > 0); BitMatrix not_opt_mask; std::vector<unsigned int> non_opt_col_indices(WIDTH); not_opt_cols.Find(non_opt_col_indices); DenseMatrix<double> RHSsub(HEIGHT, WIDTH); DenseMatrix<double> Xsub(HEIGHT, WIDTH); DenseMatrix<double> Ysub(HEIGHT, WIDTH); unsigned int iter = 0; while (!non_opt_col_indices.empty()) { // exit if not getting anywhere if (iter >= MAX_ITER) return false; UpdatePassiveSet(passive_set, PBAR, HEIGHT, not_opt_cols, nonopt_set, infeas_set, not_good, P, Ninf); // equivalent of repmat(NotOptCols, HEIGHT, 1) not_opt_mask = MatrixFromColumnMask(not_opt_cols, HEIGHT); // Setup for the normal equation solver by extracting submatrices // from RHS and X. The normal equation solver will extract further // subproblems from RHSsub and Xsub and write all updated values // back into RHSsub and Xsub. 
RHS.SubmatrixFromCols(RHSsub, non_opt_col_indices); X.SubmatrixFromCols(Xsub, non_opt_col_indices); if (!BppSolveNormalEqNoGroup(non_opt_col_indices, passive_set, LHS, RHSsub, Xsub)) return false; ZeroizeSmallValues(Xsub, 1.0e-12); // compute Ysub = LHS * Xsub - RHSsub Ysub.Resize(RHSsub.Height(), RHSsub.Width()); Gemm(NORMAL, NORMAL, T(1), LHS, Xsub, T(0), Ysub); Axpy( T(-1), RHSsub, Ysub); // update Y and X using the new values in Ysub and Xsub OverwriteCols(Y, Ysub, non_opt_col_indices, non_opt_col_indices.size()); OverwriteCols(X, Xsub, non_opt_col_indices, non_opt_col_indices.size()); ZeroizeSmallValues(X, 1.0e-12); ZeroizeSmallValues(Y, 1.0e-12); // Check optimality - BppUpdateSets does the equivalent of the next two lines. // nonopt_set = not_opt_mask & (Y < T(0)) & ~passive_set; // infeas_set = not_opt_mask & (X < T(0)) & passive_set; BppUpdateSets(nonopt_set, infeas_set, not_opt_mask, X, Y, passive_set); nonopt_set.SumColumns(not_good); infeas_set.SumColumns(col_sums); not_good += col_sums; not_opt_cols = (not_good > 0); not_opt_cols.Find(non_opt_col_indices); ++iter; } return true; }
void BppUpdateSets(BitMatrix& nonopt_set, BitMatrix& infeas_set, const BitMatrix& not_opt_mask, const DenseMatrix<T>& X, const DenseMatrix<T>& Y, const BitMatrix& passive_set) { // This function performs the equivalent of these operations: // // nonopt_set = not_opt_mask & (Y < T(0)) & ~passive_set; // infeas_set = not_opt_mask & (X < T(0)) & passive_set; const unsigned int height = not_opt_mask.Height(); const unsigned int width = not_opt_mask.Width(); if ( (static_cast<unsigned int>(X.Height()) != height) || (static_cast<unsigned int>(Y.Height()) != height) || (passive_set.Height() != height)) throw std::logic_error("BppUpdateSets: height mismatch"); if ( (static_cast<unsigned int>(X.Width()) != width) || (static_cast<unsigned int>(Y.Width()) != width) || (passive_set.Width() != width)) throw std::logic_error("BppUpdateSets: width mismatch"); nonopt_set.Resize(height, width); infeas_set.Resize(height, width); const unsigned int BITS = BitMatrix::BITS_PER_WORD; const unsigned int MASK = nonopt_set.Mask(); assert(infeas_set.Mask() == MASK); unsigned int* buf_r = nonopt_set.Buffer(); const unsigned int ldim_r = nonopt_set.LDim(); unsigned int* buf_i = infeas_set.Buffer(); const unsigned int ldim_i = infeas_set.LDim(); const unsigned int* buf_m = not_opt_mask.LockedBuffer(); const unsigned int ldim_m = not_opt_mask.LDim(); const T* buf_x = X.LockedBuffer(); const unsigned int ldim_x = X.LDim(); const T* buf_y = Y.LockedBuffer(); const unsigned int ldim_y = Y.LDim(); const unsigned int* buf_p = passive_set.LockedBuffer(); const unsigned int ldim_p = passive_set.LDim(); const unsigned int full_wds = height / BITS; const unsigned int extra = height - BITS*full_wds; assert(ldim_r >= ldim_m); assert(ldim_r >= ldim_p); OPENMP_PRAGMA(omp parallel for) for (unsigned int c=0; c<width; ++c) { unsigned int offset_r = c*ldim_r; unsigned int offset_i = c*ldim_i; unsigned int offset_m = c*ldim_m; unsigned int offset_x = c*ldim_x; unsigned int offset_y = c*ldim_y; unsigned int 
offset_p = c*ldim_p; unsigned int r_wd = 0, r=0; for (; r_wd<full_wds; ++r_wd) { unsigned int x_wd = 0, y_wd = 0; for (unsigned int q=0; q<BITS; ++q, ++r) { if (buf_x[offset_x + r] < T(0)) x_wd |= (1 << q); if (buf_y[offset_y + r] < T(0)) y_wd |= (1 << q); } buf_r[offset_r + r_wd] = buf_m[offset_m + r_wd] & y_wd & ~buf_p[offset_p + r_wd]; buf_i[offset_i + r_wd] = buf_m[offset_m + r_wd] & x_wd & buf_p[offset_p + r_wd]; } if (extra > 0) { unsigned int x_wd = 0, y_wd = 0; for (unsigned int q=0; q<extra; ++q, ++r) { if (buf_x[offset_x + r] < T(0)) x_wd |= (1 << q); if (buf_y[offset_y + r] < T(0)) y_wd |= (1 << q); } buf_r[offset_r + r_wd] = MASK & buf_m[offset_m + r_wd] & y_wd & ~buf_p[offset_p + r_wd]; buf_i[offset_i + r_wd] = MASK & buf_m[offset_m + r_wd] & x_wd & buf_p[offset_p + r_wd]; } } }