/**
 * Echelonize the sliced matrix A in place via a PLE/PLUQ decomposition.
 *
 * With full != 0 the decomposition is computed with mzd_slice_pluq(), the
 * columns to the right of the leading r x r block are solved against that
 * block with mzd_slice_trsm_upper_left(), the block itself is overwritten
 * by mzd_slice_set_ui(U, 1) and finally the column permutation Q is applied
 * to the first r rows.
 *
 * With full == 0 the decomposition is computed with mzd_slice_ple(); then,
 * for every row i < r, columns 0..i are cleared in every slice and a 1 is
 * written at column Q->values[i].
 *
 * In both cases rows r..A->nrows-1 are reset with mzd_slice_set_ui(R, 0).
 *
 * @param A     Matrix to echelonize; modified in place.
 * @param full  Non-zero selects the PLUQ/back-substitution path, zero the
 *              PLE-only path.
 * @return      The value r returned by mzd_slice_pluq()/mzd_slice_ple()
 *              (presumably the rank of A -- confirm against their docs).
 */
rci_t mzd_slice_echelonize_ple(mzd_slice_t *A, int full) {
  mzp_t *P = mzp_init(A->nrows);
  mzp_t *Q = mzp_init(A->ncols);

  rci_t r;
  if (full) {
    r = mzd_slice_pluq(A, P, Q);

    mzd_slice_t *U = mzd_slice_init_window(A, 0, 0, r, r);
    /* r rounded down to a multiple of m4ri_radix */
    const rci_t r_radix = m4ri_radix * (r / m4ri_radix);

    if (r_radix == r && r != A->ncols) {
      /* r is a multiple of m4ri_radix: the right-hand part can be used
       * directly as a window starting at column r. */
      mzd_slice_t *B = mzd_slice_init_window(A, 0, r, r, A->ncols);
      for (rci_t i = 0; i < r; ++i) {
        mzd_slice_write_elem(U, i, i, 1);
      }
      mzd_slice_trsm_upper_left(U, B);
      mzd_slice_free_window(B);
    } else if (r_radix != r && r != A->ncols) {
      assert(r_radix < r);
      /* r is not a multiple of m4ri_radix: the columns starting at r_radix
       * are copied out with mzd_slice_submatrix(), solved, and copied back
       * through a window (NOTE(review): presumably because the solver
       * cannot work on a window that overlaps U -- confirm). */
      if (A->ncols > r_radix + m4ri_radix) {
        mzd_slice_t *B0 = mzd_slice_submatrix(NULL, A, 0, r_radix, r, r_radix + m4ri_radix);
        mzd_slice_t *B0w = mzd_slice_init_window(A, 0, r_radix, r, r_radix + m4ri_radix);
        mzd_slice_t *B1 = mzd_slice_init_window(A, 0, r_radix + m4ri_radix, r, A->ncols);
        for (rci_t i = 0; i < r; ++i) {
          mzd_slice_write_elem(U, i, i, 1);
        }
        mzd_slice_trsm_upper_left(U, B0);
        mzd_slice_trsm_upper_left(U, B1);
        mzd_slice_copy(B0w, B0);
        mzd_slice_free(B0);
        mzd_slice_free_window(B0w);
        mzd_slice_free_window(B1);
      } else {
        mzd_slice_t *B = mzd_slice_submatrix(NULL, A, 0, r_radix, r, A->ncols);
        mzd_slice_t *Bw = mzd_slice_init_window(A, 0, r_radix, r, A->ncols);
        for (rci_t i = 0; i < r; ++i) {
          mzd_slice_write_elem(U, i, i, 1);
        }
        mzd_slice_trsm_upper_left(U, B);
        mzd_slice_copy(Bw, B);
        mzd_slice_free_window(Bw);
        mzd_slice_free(B);
      }
    }
    /* Overwrite the leading r x r block (also discards whatever the
     * copy-back above wrote into columns r_radix..r-1). */
    mzd_slice_set_ui(U, 1);
    mzd_slice_free_window(U);

    if (r) {
      mzd_slice_t *A0 = mzd_slice_init_window(A, 0, 0, r, A->ncols);
      mzd_slice_apply_p_right(A0, Q);
      mzd_slice_free_window(A0);
    }
  } else {
    r = mzd_slice_ple(A, P, Q);

    for (rci_t i = 0; i < r; ++i) {
      for (int e = 0; e < A->depth; e++) {
        /* Clear columns 0..i of row i, one chunk of at most m4ri_radix
         * bits per call. */
        for (rci_t j = 0; j <= i; j += m4ri_radix) {
          int const length = MIN(m4ri_radix, i - j + 1);
          mzd_clear_bits(A->x[e], i, j, length);
        }
      }
      mzd_slice_write_elem(A, i, Q->values[i], 1);
    }
  }

  if (r != A->nrows) {
    /* Reset every row from r onwards. */
    mzd_slice_t *R = mzd_slice_init_window(A, r, 0, A->nrows, A->ncols);
    mzd_slice_set_ui(R, 0);
    mzd_slice_free_window(R);
  }

  mzp_free(P);
  mzp_free(Q);
  return r;
}
int run(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i && j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - i); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } mzp_t *P = mzp_init(A->nrows); mzp_t *Q = mzp_init(A->ncols); #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if (papi_res) m4ri_die(""); #endif if(strcmp(p->algorithm, "m4ri") == 0) p->r = mzd_echelonize_m4ri(A, 0, 0); else if(strcmp(p->algorithm, "ple") == 0) p->r = mzd_ple(A, P, Q, 0); else if(strcmp(p->algorithm, "mmpf") == 0) p->r = _mzd_ple_russian(A, P, Q, 0); else m4ri_die("unknown algorithm %s",p->algorithm); #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else mzp_free(P); mzp_free(Q); PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { data[nv] -= loop_calibration[nv]; } #endif mzd_free(A); return 0; }
/**
 * Solve A X = B in place, given A already holding its PLUQ factors.
 *
 * A stores L (lower triangular) and U (upper triangular); B is overwritten
 * step by step:
 *   1) apply P to B,
 *   2) solve against L on the first `rank` rows,
 *   3) solve against U on the first `rank` rows,
 *   4) apply the transpose of Q to B.
 *
 * @param A                    Matrix holding the L and U factors.
 * @param rank                 Size of the leading square block used for
 *                             both triangular solves.
 * @param P                    Row permutation, applied first.
 * @param Q                    Permutation applied (transposed) last.
 * @param B                    Right-hand side, overwritten in place.
 * @param cutoff               Forwarded to the trsm/addmul routines.
 * @param inconsistency_check  Non-zero: verify the rows below `rank`
 *                             vanish after elimination. Zero: skip the
 *                             check and clear rows rank..B->nrows-1.
 * @return 0, or -1 if the check ran and found a non-zero remainder.
 */
int _mzd_pluq_solve_left(mzd_t const *A, rci_t rank, mzp_t const *P, mzp_t const *Q, mzd_t *B,
                         int const cutoff, int const inconsistency_check) {
  int retval = 0;

  /* Step 1: bring B into the row order of the factorisation. */
  mzd_apply_p_left(B, P);

  /* Step 2: forward solve with the leading rank x rank block of A. */
  mzd_t const *factors = mzd_init_window_const(A, 0, 0, rank, rank);
  mzd_t *upper_rows = mzd_init_window(B, 0, 0, rank, B->ncols);
  mzd_trsm_lower_left(factors, upper_rows, cutoff);

  if (inconsistency_check) {
    /* Skipping this is faster: it folds the part of L below the leading
     * block into the matching rows of B and tests the result. */
    mzd_t const *lower_l = mzd_init_window_const(A, rank, 0, A->nrows, rank);
    mzd_t *lower_rows = mzd_init_window(B, rank, 0, A->nrows, B->ncols);

    if (B->nrows > A->nrows) {
      /* Rows of B past the last row of A are reset via mzd_set_ui(.., 0). */
      mzd_t *extra_rows = mzd_init_window(B, A->nrows, 0, B->nrows, B->ncols);
      mzd_set_ui(extra_rows, 0);
      mzd_free_window(extra_rows);
    }

    mzd_addmul(lower_rows, lower_l, upper_rows, cutoff);

    /* Anything left over in these rows is reported as -1. */
    retval = mzd_is_zero(lower_rows) ? 0 : -1;

    mzd_free_window((mzd_t*)lower_l);
    mzd_free_window(lower_rows);
  }

  /* Step 3: backward solve with the same leading block. */
  mzd_trsm_upper_left(factors, upper_rows, cutoff);
  mzd_free_window((mzd_t*)factors);
  mzd_free_window(upper_rows);

  if (!inconsistency_check) {
    /* Without the check, rows from `rank` on are simply cleared, one chunk
     * of at most m4ri_radix bits per call. (With the check this is
     * skipped: a passing check means those rows were already zero.) */
    for (rci_t row = rank; row < B->nrows; ++row) {
      rci_t col = 0;
      while (col < B->ncols) {
        mzd_clear_bits(B, row, col, MIN(m4ri_radix, B->ncols - col));
        col += m4ri_radix;
      }
    }
  }

  /* Step 4: undo the permutation Q. */
  mzd_apply_p_left_trans(B, Q);

  __M4RI_DD_MZD(B);
  __M4RI_DD_INT(retval);
  return retval;
}
int run_nothing(void *_p, unsigned long long *data, int *data_len) { struct elim_params *p = (struct elim_params *)_p; mzd_t *A = mzd_init(p->m, p->n); if(p->r != 0) { mzd_t *L, *U; L = mzd_init(p->m, p->m); U = mzd_init(p->m, p->n); mzd_randomize(U); mzd_randomize(L); for (rci_t i = 0; i < p->m; ++i) { for (rci_t j = i + 1; j < p->m; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->m - j); mzd_clear_bits(L, i, j, length); } mzd_write_bit(L,i,i, 1); for (rci_t j = 0; j < i && j <p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, i - j); mzd_clear_bits(U, i, j, length); } if(i < p->r) { mzd_write_bit(U, i, i, 1); } else { for (rci_t j = i; j < p->n; j+=m4ri_radix) { int const length = MIN(m4ri_radix, p->n - j); mzd_clear_bits(U, i, j, length); } } } mzd_mul(A,L,U,0); mzd_free(L); mzd_free(U); } else { mzd_randomize(A); } #ifndef HAVE_LIBPAPI *data_len = 2; #else *data_len = MIN(papi_array_len + 1, *data_len); #endif int papi_res; #ifndef HAVE_LIBPAPI data[0] = walltime(0); data[1] = cpucycles(); #else int array_len = *data_len - 1; unsigned long long t0 = PAPI_get_virt_usec(); papi_res = PAPI_start_counters((int*)papi_events, array_len); if(papi_res) m4ri_die(""); #endif #ifndef HAVE_LIBPAPI data[1] = cpucycles() - data[1]; data[0] = walltime(data[0]); #else PAPI_stop_counters((long long*)&data[1], array_len); t0 = PAPI_get_virt_usec() - t0; data[0] = t0; for (int nv = 0; nv <= array_len; ++nv) { if (data[nv] < loop_calibration[nv]) loop_calibration[nv] = data[nv]; } #endif mzd_free(A); return (0); }