void queue(unsigned char j) { #define GIJ gi[j] #define GJI g[j][GIJ] #define GJI2 g[j][GIJ+1] if(gi[j]<GSL && (g[j][GIJ]|| g[j][GIJ+1])){ //g[j][GIJ] || g[j][GIJ2])){ if(GJI>GJI2) dec(j); else if(GJI<GJI2) inc(j); else ;//do nothing; aq(cur[j],j); } }
// Forwards bj to aq(), paired with this object's bd.bc member.
void bi(ag bj)
{
    aq(bj, this->bd.bc);
}
// Smoke test for algebraic_numbers::manager: isolates the roots of several
// univariate polynomials, checks the number of roots found, and prints the
// results of arithmetic, comparison, root extraction and rationality queries
// on the resulting algebraic numbers to std::cout.
//
// NOTE: the function contains a bare `return;` near the end; the statements
// after it are never executed.
static void tst1() {
    unsynch_mpq_manager nm;
    polynomial::manager m(nm);
    polynomial_ref x(m);
    x = m.mk_polynomial(m.mk_var());
    polynomial_ref p(m);

    // p = 3x - 2: exactly one root is expected.
    p = 3*x - 2;
    algebraic_numbers::manager am(nm);
    scoped_anum_vector rs1(am);
    std::cout << "p: " << p << "\n";
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 1);
    std::cout.flush();

    // p = x^2 - 2: two roots are expected (`^` is used as exponentiation on
    // polynomial_ref here, judging by the printed labels).
    p = (x^2) - 2;
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 2);

    // Keep the second root as `sqrt2`; rs1[0] is labelled "-sqrt(2)" below,
    // so the roots are presumably returned in ascending order.
    scoped_anum sqrt2(am);
    am.set(sqrt2, rs1[1]);

    scoped_mpq q(nm);
    nm.set(q, 1, 3);
    scoped_anum aq(am);
    am.set(aq, q); // create algebraic number representing 1/3

    // aq := sqrt2 + aq, printed as decimal, interval and root object.
    am.add(sqrt2, aq, aq);
    std::cout << "sqrt(2) + 1/3: ";
    am.display_decimal(std::cout, aq, 10);
    std::cout << " ";
    am.display_interval(std::cout, aq);
    std::cout << " ";
    am.display_root(std::cout, aq);
    std::cout << "\n";

    // Reset aq to q and repeat with the first root of x^2 - 2.
    am.set(aq, q);
    am.add(rs1[0], aq, aq);
    std::cout << "-sqrt(2) + 1/3: ";
    am.display_decimal(std::cout, aq, 10);
    std::cout << " ";
    am.display_interval(std::cout, aq);
    std::cout << " ";
    am.display_root(std::cout, aq);
    std::cout << "\n";

    // p = (x^5 - x - 1)(x - 1)(x - 2): three roots are expected.
    p = ((x^5) - x - 1)*(x-1)*(x-2);
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 3);

    // Keep the second of those roots as `gauss` and compare it with sqrt2.
    scoped_anum gauss(am);
    am.set(gauss, rs1[1]);
    std::cout << "compare(" << sqrt2 << ", " << gauss << "): " << am.compare(sqrt2, gauss) << "\n";

    // Dump the manager's statistics collected so far.
    statistics st;
    am.collect_statistics(st);
    st.display_smt2(std::cout);

    // p = (x^2 - 2)(x^2 - 3): four roots are expected.
    p = ((x^2) - 2)*((x^2) - 3);
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 4);

    // rs1[2] is kept as `hidden_sqrt2` and compared against sqrt2.
    scoped_anum hidden_sqrt2(am);
    am.set(hidden_sqrt2, rs1[2]);
    std::cout << "compare(" << sqrt2 << ", " << hidden_sqrt2 << "): " << am.compare(sqrt2, hidden_sqrt2) << "\n";

    // Statistics again, after resetting the container.
    st.reset();
    am.collect_statistics(st);
    st.display_smt2(std::cout);

    // Fourth power of sqrt2 must be the integer 4.
    std::cout << "sqrt(2)^4: " << (sqrt2^4) << "\n";
    SASSERT(is_int(power(sqrt2, 4)));
    SASSERT(power(sqrt2, 4) == 4);

    // Sum and product of algebraic numbers.
    scoped_anum sqrt2_gauss(am);
    am.add(sqrt2, gauss, sqrt2_gauss);
    std::cout << "sqrt2 + gauss: " << sqrt2_gauss << " ";
    am.display_root(std::cout, sqrt2_gauss);
    std::cout << "\n";
    std::cout << "sqrt2*sqrt2: " << sqrt2*sqrt2 << "\n";
    std::cout << "sqrt2*sqrt2 == 2: " << (sqrt2*sqrt2 == 2) << std::endl;

    // Note: the variable is named `three` but is set to -3.
    scoped_anum three(am);
    am.set(three, -3);

    // k-th roots, divisions and alternative pretty-printers.
    std::cout << "(-3)^(1/5): " << root(three, 5) << "\n";
    std::cout << "sqrt(2)^(1/3): " << root(sqrt2, 3) << "\n";
    std::cout << "as-root-object(sqrt(2)^(1/3)): " << root_obj_pp(root(sqrt2, 3)) << "\n";
    std::cout << "(sqrt(2) + 1)^(1/3): " << root(sqrt2 + 1, 3) << "\n";
    std::cout << "as-root-object((sqrt(2) + 1)^(1/3)): " << root_obj_pp(root(sqrt2 + 1, 3)) << "\n";
    std::cout << "(sqrt(2) + gauss)^(1/5): " << root(sqrt2 + gauss, 5) << "\n";
    std::cout << "as-root-object(sqrt(2) + gauss)^(1/5): " << root_obj_pp(root(sqrt2 + gauss, 5)) << "\n";
    std::cout << "(sqrt(2) / sqrt(2)): " << sqrt2 / hidden_sqrt2 << "\n";
    std::cout << "(sqrt(2) / gauss): " << sqrt2 / gauss << "\n";
    std::cout << "(sqrt(2) / gauss) 30 digits: " << decimal_pp(sqrt2 / gauss, 30) << "\n";
    std::cout << "as-root-object(sqrt(2) / gauss): " << root_obj_pp(sqrt2 / gauss) << "\n";
    std::cout << "is_int(sqrt(2)^(1/3)): " << am.is_int(root(sqrt2, 3)) << "\n";

    // tmp := 1/sqrt2, then tmp := tmp * 4, then tmp := tmp * sqrt2, printing
    // after each step; tmp is printed once more after the is_int() query.
    scoped_anum tmp(am);
    scoped_anum four(am);
    am.set(four, 4);
    am.set(tmp, sqrt2);
    am.inv(tmp);
    std::cout << "1/sqrt(2): " << tmp << "\n";
    am.mul(tmp, four, tmp);
    std::cout << "4*1/sqrt(2): " << tmp << " " << root_obj_pp(tmp) << "\n";
    am.mul(tmp, sqrt2, tmp);
    std::cout << "sqrt(2)*4*(1/sqrt2): " << tmp << " " << root_obj_pp(tmp) << "\n";
    std::cout << "is_int(sqrt(2)*4*(1/sqrt2)): " << am.is_int(tmp) << ", after is-int: " << tmp << "\n";

    // p = (998x - 1414)(x^2 - 15); the roots are isolated but not displayed.
    p = (998*x - 1414)*((x^2) - 15);
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    std::cout << "is-rational(sqrt2): " << am.is_rational(sqrt2) << "\n";

    // rs1[1] is kept as `qr` and printed before and after the is_rational()
    // query.
    scoped_anum qr(am);
    am.set(qr, rs1[1]);
    std::cout << "qr: " << root_obj_pp(qr);
    std::cout << ", is-rational: " << am.is_rational(qr) << ", val: " << root_obj_pp(qr) << "\n";

    return;

    // ---- Unreachable: everything below follows the unconditional return. ----
    std::cout << "compare(" << sqrt2 << ", " << gauss << "): " << am.compare(sqrt2, gauss) << "\n";
    p = (x^16) - 136*(x^14) + 6476*(x^12) - 141912*(x^10) + 1513334*(x^8) - 7453176*(x^6) + 13950764*(x^4) - 5596840*(x^2) + 46225;
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
}
void cbranch () { int tid; pthread_mutex_lock(&mMutRec); tid = mThreadId ++; mTids[tid] = tid; pthread_setspecific(mTidKey, (void*)(mTids + tid)); MMRegistry::registerMemManager(mManagers[tid]); pthread_mutex_unlock(&mMutRec); SmartArrayPtr < Set > aq(mMaxLocalQueueSize); FixedVector < Set > ltq((Set*)aq, mMaxLocalQueueSize); Solution asolv[2]; FixedVector < Solution > solv(asolv, 2); SmartArrayPtr < Set > alsetv(mMaxLocalSetBufferSize); FixedVector < Set > lsetv(alsetv, mMaxLocalSetBufferSize); SmartArrayPtr < Solution > alsolv(mMaxLocalSolutionBufferSize); FixedVector < Solution > lsolv(alsolv, mMaxLocalSolutionBufferSize); for(int step = 1; ; step ++) { Set s; if(ltq.empty()) { pthread_mutex_lock(&mMutTaskQueue); mStarv ++; mSteps = BNBMAX(mSteps, step); while(mTaskQueue.empty() && (mStarv != mNumThreads) && (mSteps < mLocalSteps)) { mLocalCounters[tid].mStarv ++; struct timeval tv; double t1, t2; gettimeofday(&tv, NULL); t1 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001; pthread_cond_wait(&mCV, &mMutTaskQueue); gettimeofday(&tv, NULL); t2 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001; mLocalCounters[tid].mStarvTime += (t2 - t1); } if(mSteps >= mLocalSteps) { pthread_cond_broadcast(&mCV); pthread_mutex_unlock(&mMutTaskQueue); break; } else if(!mTaskQueue.empty()) { s = mTaskQueue.top (); mTaskQueue.pop (); mStarv --; mLocalCounters[tid].mGet ++; pthread_mutex_unlock(&mMutTaskQueue); } else { pthread_cond_broadcast(&mCV); pthread_mutex_unlock(&mMutTaskQueue); break; } } else { s = ltq.back(); ltq.pop_back(); } if (!mSetFactory->discard (s, getRecord())){ mSetFactory->branch (s, lsetv, lsolv, getRecord(), mInfos + tid, ltq.size()); typename ProblemFactory::ValueType rec = getRecord(); while(!lsolv.empty()) { Solution s = lsolv.back(); lsolv.pop_back(); if(((Factory::getProblemType() == BNB_MAXIMIZE) && (s.getValue() > rec)) || ((Factory::getProblemType() == BNB_MINIMIZE) && (s.getValue() < rec))) { rec = s.getValue(); if(!solv.empty()) solv.pop_back(); 
solv.push_back(s); } } updateRecord(rec); while(!lsetv.empty()) { Set s = lsetv.back(); lsetv.pop_back(); if(!mSetFactory->discard (s, rec)) ltq.push_back(s); else mInfos[tid].mDiscardedByRecord ++; } mSteps = BNBMAX(mSteps, step); if(mSteps >= mLocalSteps) { pthread_mutex_lock(&mMutTaskQueue); if(mStarv) pthread_cond_broadcast(&mCV); pthread_mutex_unlock(&mMutTaskQueue); break; } if((step % mUpdateRatio) == 0) { if(!ltq.empty()) { pthread_mutex_lock(&mMutTaskQueue); struct timeval tv; double t1, t2; gettimeofday(&tv, NULL); t1 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001; mLocalCounters[tid].mDonat ++; for(int i = 0; i < mPutChunk; i ++) { if(!ltq.empty()) { MMRegistry::registerMemManager(mAuxMemManager); Set s = ltq.back(); mTaskQueue.push(s); MMRegistry::registerMemManager(mManagers[tid]); ltq.pop_back(); mLocalCounters[tid].mPut ++; } else break; } mQLen = mTaskQueue.size(); mMaxQLen = BNBMAX(mQLen, mMaxQLen); if(mStarv) pthread_cond_broadcast(&mCV); t2 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001; mLocalCounters[tid].mDonatTime += t2 - t1; pthread_mutex_unlock(&mMutTaskQueue); } } } else { mInfos[tid].mDiscardedByRecord ++; } } pthread_mutex_lock(&mMutTaskQueue); while(!ltq.empty()) { Set s = ltq.back(); ltq.pop_back(); mTaskQueue.push(s); } pushSolutions (solv, mInfos + tid); pthread_mutex_unlock(&mMutTaskQueue); }
// Compares a quantized GEMM against a float reference GEMM.
//
// For 25 random configurations (dimensions in [1, 256], random transpose
// flags, alpha/beta either absent (0), plain 1, or a random quantized value)
// the test:
//   1. computes the float reference into C_result with caffe_gemm<float>,
//   2. lets five observing quantizers see A, B, C (and C_result), alpha, beta,
//   3. quantizes the inputs and runs the Dtype GEMM on CPU or on the default
//      device, depending on Caffe::mode(),
//   4. de-quantizes the result and expects every element to be within 5% of
//      the larger absolute bound of the C quantizer's input range.
TYPED_TEST(QuantBlasTest, TestGemmComparativeFloatQuant) {
  typedef typename TypeParam::Dtype Dtype;

  // Expect at most 5% error
  float percentile_eps = 0.05;

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  // Need to test > 64 dimension
  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_real_distribution<float> valRand(-2.0, 2.0);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    int_tp M = dimsRand(rngen);
    int_tp N = dimsRand(rngen);
    int_tp K = dimsRand(rngen);

    CBLAS_TRANSPOSE trans_A = boolRand(rngen) ? CblasTrans : CblasNoTrans;
    CBLAS_TRANSPOSE trans_B = boolRand(rngen) ? CblasTrans : CblasNoTrans;

    // At least one of alpha / beta is always present.
    bool has_alpha = boolRand(rngen);
    bool has_beta = has_alpha ? boolRand(rngen) : true;

    // A scalar can only be quantized when it is present.
    bool alpha_with_quant = boolRand(rngen) && has_alpha;
    bool beta_with_quant = boolRand(rngen) && has_beta;

    // Absent -> 0, present but unquantized -> 1, quantized -> random value.
    float alpha_val;
    float beta_val;

    if (has_alpha) {
      alpha_val = alpha_with_quant ? valRand(rngen) : 1.0f;
    } else {
      alpha_val = 0.0f;
    }

    if (has_beta) {
      beta_val = beta_with_quant ? valRand(rngen) : 1.0f;
    } else {
      beta_val = 0.0f;
    }

    vector<int_tp> A_shape(4, 1);
    vector<int_tp> B_shape(4, 1);
    vector<int_tp> C_shape(4, 1);

    A_shape[2] = M;
    A_shape[3] = K;
    B_shape[2] = K;
    B_shape[3] = N;
    C_shape[2] = M;
    C_shape[3] = N;

    Blob<float> A(A_shape, Caffe::GetDefaultDevice());
    Blob<float> B(B_shape, Caffe::GetDefaultDevice());
    Blob<float> C(C_shape, Caffe::GetDefaultDevice());
    Blob<float> C_result(C_shape, Caffe::GetDefaultDevice());

    Blob<Dtype> A_quant(A_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> B_quant(B_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> C_quant(C_shape, Caffe::GetDefaultDevice());

    Blob<float> C_unquant(C_shape, Caffe::GetDefaultDevice());

    caffe_rng_gaussian(M * K, 0.0f, 0.5f, A.mutable_cpu_data());
    caffe_rng_gaussian(K * N, 0.0f, 0.5f, B.mutable_cpu_data());
    caffe_rng_gaussian(M * N, 0.0f, 0.5f, C.mutable_cpu_data());

    // C_result starts as a copy of C and receives the float reference.
    caffe_copy(M * N, C.cpu_data(), C_result.mutable_cpu_data());

    QuantizerParameter qpm_a;
    QuantizerParameter qpm_b;
    QuantizerParameter qpm_c;
    QuantizerParameter qpm_alpha;
    QuantizerParameter qpm_beta;
    qpm_a.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_b.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_c.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_alpha.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_beta.set_mode(CAFFE_QUANT_OBSERVE);

    Quantizer<float, Dtype> aq(qpm_a);
    Quantizer<float, Dtype> bq(qpm_b);
    Quantizer<float, Dtype> cq(qpm_c);
    Quantizer<float, Dtype> alphaq(qpm_alpha);
    Quantizer<float, Dtype> betaq(qpm_beta);

    // Normal GEMM
    caffe_gemm<float>(
                trans_A, trans_B,
                M, N, K,
                alpha_val,
                A.cpu_data(), B.cpu_data(),
                beta_val,
                C_result.mutable_cpu_data());

    // Observe all values that will be relevant for quantization
    // (cq sees both the initial C and the reference result).
    aq.ObserveIn_cpu(M * K, A.cpu_data());
    bq.ObserveIn_cpu(K * N, B.cpu_data());
    cq.ObserveIn_cpu(M * N, C.cpu_data());
    cq.ObserveIn_cpu(M * N, C_result.cpu_data());
    alphaq.ObserveIn_cpu(1, &alpha_val);
    betaq.ObserveIn_cpu(1, &beta_val);

    // Apply observed values to the quantizer
    aq.update();
    bq.update();
    cq.update();
    alphaq.update();
    betaq.update();

    // Quantize A, B and C
    aq.Forward_cpu(M * K, A.cpu_data(), A_quant.mutable_cpu_data());
    bq.Forward_cpu(K * N, B.cpu_data(), B_quant.mutable_cpu_data());
    cq.Forward_cpu(M * N, C.cpu_data(), C_quant.mutable_cpu_data());

    // Unquantized scalars are passed as has_alpha / has_beta converted to
    // Dtype (0 or 1); quantized ones are overwritten below.
    Dtype alpha_val_quant = has_alpha;
    Dtype beta_val_quant = has_beta;

    // Quantize alpha
    if (alpha_with_quant) {
      alphaq.Forward_cpu(1, &alpha_val, &alpha_val_quant);
    }

    // Quantize beta
    if (beta_with_quant) {
      betaq.Forward_cpu(1, &beta_val, &beta_val_quant);
    }

    /*
    std::cout << "C max:" << cq.in_quantizer_values().max << std::endl;
    std::cout << "C min:" << cq.in_quantizer_values().min << std::endl;
    std::cout << "C zero:" << cq.in_quantizer_values().zero << std::endl;
    std::cout << "C scale:" << cq.in_quantizer_values().scale << std::endl;
    std::cout << "C max:" << cq.out_quantizer_values().max << std::endl;
    std::cout << "C min:" << cq.out_quantizer_values().min << std::endl;
    std::cout << "C zero:" << cq.out_quantizer_values().zero << std::endl;
    std::cout << "C scale:" << cq.out_quantizer_values().scale << std::endl;
    */

    // Quantizer values for alpha / beta are only supplied when that scalar
    // was actually quantized; otherwise nullptr is passed.
    if (Caffe::mode() == Caffe::Brew::CPU) {
      caffe_gemm<Dtype>(
                  trans_A, trans_B,
                  M, N, K,
                  alpha_val_quant,
                  A_quant.cpu_data(), B_quant.cpu_data(),
                  beta_val_quant,
                  C_quant.mutable_cpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(bq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(cq.out_quantizer_values()));
    } else {
      Caffe::GetDefaultDevice()->template gemm<Dtype>(trans_A, trans_B,
                  M, N, K,
                  alpha_val_quant,
                  A_quant.gpu_data(), B_quant.gpu_data(),
                  beta_val_quant,
                  C_quant.mutable_gpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(bq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(cq.out_quantizer_values()));
    }

    // Map the quantized result back to float for the comparison.
    cq.Backward_cpu(M * N, C_quant.cpu_data(), C_unquant.mutable_cpu_data());

    // print_matrix(A_quant.cpu_data(), M, K);
    // print_matrix(B_quant.cpu_data(), K, N);
    // print_matrix(C_quant.cpu_data(), M, N);
    // print_matrix(C_result.cpu_data(), M, N);
    // print_matrix(C_unquant.cpu_data(), M, N);

    // Tolerance: percentile_eps times the larger absolute bound of the
    // C quantizer's input range.
    const QuantizerValues cqv = cq.in_quantizer_values();
    float eps = std::max(std::abs(cqv.get_max<float>()),
                         std::abs(cqv.get_min<float>())) * percentile_eps;

    for (int_tp i = 0; i < M * N; ++i) {
      EXPECT_NEAR(C_unquant.cpu_data()[i], C_result.cpu_data()[i], eps);
      // One error is enough to abort
      if (fabs(C_unquant.cpu_data()[i] - C_result.cpu_data()[i]) >= eps) {
        break;
      }
    }
  }
}
// Compares a quantized GEMV against a float reference GEMV.
//
// For 25 random configurations (M, N in [1, 256], random transpose flag,
// alpha/beta either absent (0), plain 1, or a random quantized value) the
// test:
//   1. computes the float reference into y_result with caffe_gemv<float>,
//   2. lets five observing quantizers see A, x, y (and y_result), alpha, beta,
//   3. quantizes the inputs and runs the Dtype GEMV on CPU or on the default
//      device, depending on Caffe::mode(),
//   4. de-quantizes the result and expects every element to be within 5% of
//      the larger absolute bound of the y quantizer's input range.
TYPED_TEST(QuantBlasTest, TestGemvComparativeFloatQuant) {
  typedef typename TypeParam::Dtype Dtype;

  // Expect at most 5% error
  float percentile_eps = 0.05;

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  // Need to test > 64 dimension
  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_real_distribution<float> valRand(-2.0, 2.0);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    int_tp M = dimsRand(rngen);
    int_tp N = dimsRand(rngen);

    CBLAS_TRANSPOSE trans_A = boolRand(rngen) ? CblasTrans : CblasNoTrans;

    // At least one of alpha / beta is always present.
    bool has_alpha = boolRand(rngen);
    bool has_beta = has_alpha ? boolRand(rngen) : true;

    // A scalar can only be quantized when it is present.
    bool alpha_with_quant = boolRand(rngen) && has_alpha;
    bool beta_with_quant = boolRand(rngen) && has_beta;

    // Absent -> 0, present but unquantized -> 1, quantized -> random value.
    float alpha_val;
    float beta_val;

    if (has_alpha) {
      alpha_val = alpha_with_quant ? valRand(rngen) : 1.0f;
    } else {
      alpha_val = 0.0f;
    }

    if (has_beta) {
      beta_val = beta_with_quant ? valRand(rngen) : 1.0f;
    } else {
      beta_val = 0.0f;
    }

    // Vector lengths depend on whether A is used transposed.
    const int_tp x_len = trans_A == CblasTrans ? M : N;
    const int_tp y_len = trans_A == CblasTrans ? N : M;

    vector<int_tp> A_shape(4, 1);
    vector<int_tp> x_shape(4, 1);
    vector<int_tp> y_shape(4, 1);

    A_shape[2] = M;
    A_shape[3] = N;
    x_shape[3] = x_len;
    y_shape[3] = y_len;

    Blob<float> A(A_shape, Caffe::GetDefaultDevice());
    Blob<float> x(x_shape, Caffe::GetDefaultDevice());
    Blob<float> y(y_shape, Caffe::GetDefaultDevice());
    Blob<float> y_result(y_shape, Caffe::GetDefaultDevice());

    Blob<Dtype> A_quant(A_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> x_quant(x_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> y_quant(y_shape, Caffe::GetDefaultDevice());

    Blob<float> y_unquant(y_shape, Caffe::GetDefaultDevice());

    caffe_rng_gaussian(M * N, 0.0f, 0.5f, A.mutable_cpu_data());
    caffe_rng_gaussian(x_len, 0.0f, 0.5f, x.mutable_cpu_data());
    caffe_rng_gaussian(y_len, 0.0f, 0.5f, y.mutable_cpu_data());

    // y_result starts as a copy of y and receives the float reference.
    caffe_copy(y_len, y.cpu_data(), y_result.mutable_cpu_data());

    QuantizerParameter qpm_a;
    QuantizerParameter qpm_x;
    QuantizerParameter qpm_y;
    QuantizerParameter qpm_alpha;
    QuantizerParameter qpm_beta;
    qpm_a.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_x.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_y.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_alpha.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_beta.set_mode(CAFFE_QUANT_OBSERVE);

    Quantizer<float, Dtype> aq(qpm_a);
    Quantizer<float, Dtype> xq(qpm_x);
    Quantizer<float, Dtype> yq(qpm_y);
    Quantizer<float, Dtype> alphaq(qpm_alpha);
    Quantizer<float, Dtype> betaq(qpm_beta);

    // Normal GEMV
    caffe_gemv<float>(
                trans_A,
                M, N,
                alpha_val,
                A.cpu_data(), x.cpu_data(),
                beta_val,
                y_result.mutable_cpu_data());

    // Observe all values that will be relevant for quantization
    // (yq sees both the initial y and the reference result).
    aq.ObserveIn_cpu(M * N, A.cpu_data());
    xq.ObserveIn_cpu(x_len, x.cpu_data());
    yq.ObserveIn_cpu(y_len, y.cpu_data());
    yq.ObserveIn_cpu(y_len, y_result.cpu_data());
    alphaq.ObserveIn_cpu(1, &alpha_val);
    betaq.ObserveIn_cpu(1, &beta_val);

    // Apply observed values to the quantizer
    aq.update();
    xq.update();
    yq.update();
    alphaq.update();
    betaq.update();

    // Quantize A, x and y
    aq.Forward_cpu(M * N, A.cpu_data(), A_quant.mutable_cpu_data());
    xq.Forward_cpu(x_len, x.cpu_data(), x_quant.mutable_cpu_data());
    yq.Forward_cpu(y_len, y.cpu_data(), y_quant.mutable_cpu_data());

    // Unquantized scalars are passed as has_alpha / has_beta converted to
    // Dtype (0 or 1); quantized ones are overwritten below.
    Dtype alpha_val_quant = has_alpha;
    Dtype beta_val_quant = has_beta;

    // Quantize alpha
    if (alpha_with_quant) {
      alphaq.Forward_cpu(1, &alpha_val, &alpha_val_quant);
    }

    // Quantize beta
    if (beta_with_quant) {
      betaq.Forward_cpu(1, &beta_val, &beta_val_quant);
    }

    // Quantizer values for alpha / beta are only supplied when that scalar
    // was actually quantized; otherwise nullptr is passed.
    if (Caffe::mode() == Caffe::Brew::CPU) {
      caffe_gemv<Dtype>(trans_A, M, N,
                  alpha_val_quant,
                  A_quant.cpu_data(), x_quant.cpu_data(),
                  beta_val_quant,
                  y_quant.mutable_cpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));
    } else {
      Caffe::GetDefaultDevice()->template gemv<Dtype>(trans_A, M, N,
                  alpha_val_quant,
                  A_quant.gpu_data(), x_quant.gpu_data(),
                  beta_val_quant,
                  y_quant.mutable_gpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));
    }

    // Map the quantized result back to float for the comparison.
    yq.Backward_cpu(y_len, y_quant.cpu_data(), y_unquant.mutable_cpu_data());

    // Debug output carried over from the GEMM test; the names (K, B_quant,
    // C_*) do not exist in this test and must be adapted before enabling.
    // print_matrix(A_quant.cpu_data(), M, K);
    // print_matrix(B_quant.cpu_data(), K, N);
    // print_matrix(C_quant.cpu_data(), M, N);
    // print_matrix(C_result.cpu_data(), M, N);
    // print_matrix(C_unquant.cpu_data(), M, N);

    // Tolerance: percentile_eps times the larger absolute bound of the
    // y quantizer's input range.
    const QuantizerValues yqv = yq.in_quantizer_values();
    float eps = std::max(std::abs(yqv.get_max<float>()),
                         std::abs(yqv.get_min<float>())) * percentile_eps;

    for (int_tp i = 0; i < y_len; ++i) {
      EXPECT_NEAR(y_unquant.cpu_data()[i], y_result.cpu_data()[i], eps);
      // One error is enough to abort
      if (fabs(y_unquant.cpu_data()[i] - y_result.cpu_data()[i]) >= eps) {
        break;
      }
    }
  }
}