コード例 #1
0
ファイル: p.c プロジェクト: duhuanpeng/au-phone-hacking
/*
 * Advance entry j by one step and hand its current value to aq().
 *
 * The function acts only when gi[j] is below GSL and at least one of the
 * two neighbouring entries g[j][gi[j]] and g[j][gi[j]+1] is non-zero.
 * In that case dec(j) is called if the first entry is larger than the
 * second, inc(j) if it is smaller, and neither if they are equal;
 * aq(cur[j], j) is then called in all three cases.
 *
 * NOTE(review): the guard only ensures gi[j] < GSL, yet g[j][gi[j]+1] is
 * read -- confirm g[j] has at least GSL+1 elements.
 */
void queue(unsigned char j)
{
 /* Shorthand for the read position and the two entries compared below.
    The macros stay defined after this function; check for later users
    before adding #undef. */
 #define GIJ gi[j]
 #define GJI g[j][GIJ]
 #define GJI2 g[j][GIJ+1]

 /* Read position at or past the limit: nothing to do. */
 if(!(GIJ<GSL))
  return;

 /* Both neighbouring entries are zero: nothing to do. */
 if(!(GJI || GJI2))
  return;

 if(GJI>GJI2){
  dec(j);
 }else if(GJI<GJI2){
  inc(j);
 }
 /* Equal entries: no adjustment, but the value is still forwarded. */

 aq(cur[j],j);
}
コード例 #2
0
 // Forwards bj to aq(), passing this object's bd.bc as the second argument.
 void bi(ag bj) { aq(bj, this->bd.bc); }
コード例 #3
0
ファイル: algebraic.cpp プロジェクト: jackluo923/juxta
// Smoke test for algebraic_numbers::manager: isolates the real roots of a
// series of univariate polynomials, performs arithmetic, comparison, root
// and inverse operations on the resulting algebraic numbers, and prints
// everything to std::cout. Root counts are checked with SASSERT.
static void tst1() {
    unsynch_mpq_manager nm;
    polynomial::manager m(nm);
    // x is the single polynomial variable used to build every p below.
    polynomial_ref x(m);
    x = m.mk_polynomial(m.mk_var());
    polynomial_ref p(m);
    // Linear polynomial: exactly one root is expected.
    p = 3*x - 2;

    algebraic_numbers::manager am(nm);
    scoped_anum_vector rs1(am);
    std::cout << "p: " << p << "\n";
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 1);
    std::cout.flush();

    // x^2 - 2: two roots are expected.
    p = (x^2) - 2;
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 2);

    // Keep the second root as sqrt2. Presumably roots come back in
    // ascending order (the labels below treat rs1[0] as -sqrt(2)) -- TODO confirm.
    scoped_anum sqrt2(am);
    am.set(sqrt2, rs1[1]);

    scoped_mpq  q(nm);
    nm.set(q, 1, 3);
    scoped_anum aq(am);
    am.set(aq, q); // create algebraic number representing 1/3

    // NOTE(review): the third argument appears to be the output operand,
    // since aq is re-initialised from q before its next use -- confirm.
    am.add(sqrt2, aq, aq);
    std::cout << "sqrt(2) + 1/3: ";
    am.display_decimal(std::cout, aq, 10); std::cout << " "; am.display_interval(std::cout, aq);
    std::cout << " "; am.display_root(std::cout, aq); std::cout << "\n";

    // Same sum using the first root of x^2 - 2.
    am.set(aq, q);
    am.add(rs1[0], aq, aq);
    std::cout << "-sqrt(2) + 1/3: ";
    am.display_decimal(std::cout, aq, 10); std::cout << " "; am.display_interval(std::cout, aq);
    std::cout << " "; am.display_root(std::cout, aq); std::cout << "\n";

    // (x^5 - x - 1)(x - 1)(x - 2): three real roots are expected.
    p = ((x^5) - x - 1)*(x-1)*(x-2);
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 3);

    // gauss is the middle of the three roots.
    scoped_anum gauss(am);
    am.set(gauss, rs1[1]);

    std::cout << "compare(" << sqrt2 << ", " << gauss << "): " << am.compare(sqrt2, gauss) << "\n";

    // Dump the manager's statistics after the first comparison.
    statistics st;
    am.collect_statistics(st);
    st.display_smt2(std::cout);

    // (x^2 - 2)(x^2 - 3): four roots are expected.
    p = ((x^2) - 2)*((x^2) - 3);
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
    SASSERT(rs1.size() == 4);

    // Third root of the product polynomial; compared against sqrt2 below.
    // Presumably this is sqrt(2) again, obtained from a different defining
    // polynomial -- TODO confirm.
    scoped_anum hidden_sqrt2(am);
    am.set(hidden_sqrt2, rs1[2]);

    std::cout << "compare(" << sqrt2 << ", " << hidden_sqrt2 << "): " << am.compare(sqrt2, hidden_sqrt2) << "\n";
    st.reset();
    am.collect_statistics(st);
    st.display_smt2(std::cout);

    std::cout << "sqrt(2)^4: " << (sqrt2^4) << "\n";

    // sqrt(2)^4 must be recognised as the integer 4.
    SASSERT(is_int(power(sqrt2, 4)));
    SASSERT(power(sqrt2, 4) == 4);

    scoped_anum sqrt2_gauss(am);
    am.add(sqrt2, gauss, sqrt2_gauss);
    std::cout << "sqrt2 + gauss: " << sqrt2_gauss << " "; am.display_root(std::cout, sqrt2_gauss); std::cout << "\n";

    std::cout << "sqrt2*sqrt2: " << sqrt2*sqrt2 << "\n";
    std::cout << "sqrt2*sqrt2 == 2: " << (sqrt2*sqrt2 == 2) << std::endl;

    // Despite its name, `three` holds -3 (see the "(-3)^(1/5)" label below).
    scoped_anum three(am);
    am.set(three, -3);

    // Roots, quotients and alternative printers (root_obj_pp, decimal_pp).
    std::cout << "(-3)^(1/5): " << root(three, 5) << "\n";
    std::cout << "sqrt(2)^(1/3): " << root(sqrt2, 3) << "\n";
    std::cout << "as-root-object(sqrt(2)^(1/3)): " << root_obj_pp(root(sqrt2, 3)) << "\n";
    std::cout << "(sqrt(2) + 1)^(1/3): " << root(sqrt2 + 1, 3) << "\n";
    std::cout << "as-root-object((sqrt(2) + 1)^(1/3)): " << root_obj_pp(root(sqrt2 + 1, 3)) << "\n";
    std::cout << "(sqrt(2) + gauss)^(1/5): " << root(sqrt2 + gauss, 5) << "\n";
    std::cout << "as-root-object(sqrt(2) + gauss)^(1/5): " << root_obj_pp(root(sqrt2 + gauss, 5)) << "\n";
    std::cout << "(sqrt(2) / sqrt(2)): " << sqrt2 / hidden_sqrt2 << "\n";
    std::cout << "(sqrt(2) / gauss): " << sqrt2 / gauss << "\n";
    std::cout << "(sqrt(2) / gauss) 30 digits: " << decimal_pp(sqrt2 / gauss, 30) << "\n";
    std::cout << "as-root-object(sqrt(2) / gauss): " << root_obj_pp(sqrt2 / gauss) << "\n";
    std::cout << "is_int(sqrt(2)^(1/3)): " << am.is_int(root(sqrt2, 3)) << "\n";

    // Build sqrt(2) * 4 * (1/sqrt(2)) step by step in tmp and print it
    // before and after the is_int query.
    scoped_anum tmp(am);
    scoped_anum four(am);
    am.set(four, 4);
    am.set(tmp, sqrt2);
    am.inv(tmp);
    std::cout << "1/sqrt(2): " << tmp << "\n";
    am.mul(tmp, four, tmp);
    std::cout << "4*1/sqrt(2): " << tmp << "  " << root_obj_pp(tmp) << "\n";
    am.mul(tmp, sqrt2, tmp);
    std::cout << "sqrt(2)*4*(1/sqrt2): " << tmp << "  " << root_obj_pp(tmp) << "\n";
    std::cout << "is_int(sqrt(2)*4*(1/sqrt2)): " << am.is_int(tmp) << ", after is-int: " << tmp << "\n";

    // (998x - 1414)(x^2 - 15): the linear factor contributes the rational
    // root 1414/998; the root count is not asserted here.
    p = (998*x - 1414)*((x^2) - 15);
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);

    std::cout << "is-rational(sqrt2): " << am.is_rational(sqrt2) << "\n";

    // Second root of the polynomial above, printed before and after the
    // is_rational query.
    scoped_anum qr(am);
    am.set(qr, rs1[1]);

    std::cout << "qr: " << root_obj_pp(qr);
    std::cout << ", is-rational: " << am.is_rational(qr) << ", val: " << root_obj_pp(qr) << "\n";

    // Early exit: everything below this return is unreachable.
    return;

    std::cout << "compare(" << sqrt2 << ", " << gauss << "): " << am.compare(sqrt2, gauss) << "\n";

    p = (x^16) - 136*(x^14) + 6476*(x^12) - 141912*(x^10) + 1513334*(x^8) - 7453176*(x^6) + 13950764*(x^4) - 5596840*(x^2) + 46225;
    std::cout << "p: " << p << "\n";
    rs1.reset();
    am.isolate_roots(p, rs1);
    display_anums(std::cout, rs1);
}
コード例 #4
0
ファイル: ptraverse.hpp プロジェクト: yesyestian/BNB-solver
  void cbranch ()
  {
    int tid;
    pthread_mutex_lock(&mMutRec);
    tid = mThreadId ++;
    mTids[tid] = tid;
    pthread_setspecific(mTidKey, (void*)(mTids + tid));
    MMRegistry::registerMemManager(mManagers[tid]);
    pthread_mutex_unlock(&mMutRec);
    SmartArrayPtr < Set > aq(mMaxLocalQueueSize);
    FixedVector < Set > ltq((Set*)aq, mMaxLocalQueueSize);
    Solution asolv[2];
    FixedVector < Solution > solv(asolv, 2);
    SmartArrayPtr < Set > alsetv(mMaxLocalSetBufferSize);
    FixedVector < Set > lsetv(alsetv, mMaxLocalSetBufferSize);
    SmartArrayPtr < Solution > alsolv(mMaxLocalSolutionBufferSize);
    FixedVector < Solution > lsolv(alsolv, mMaxLocalSolutionBufferSize);
    for(int step = 1; ; step ++) {
      Set s;
      if(ltq.empty()) {
        pthread_mutex_lock(&mMutTaskQueue);
        mStarv ++;
	mSteps = BNBMAX(mSteps, step);
        while(mTaskQueue.empty() && (mStarv != mNumThreads) && (mSteps < mLocalSteps)) {
	  mLocalCounters[tid].mStarv ++;
	  struct timeval tv;
	  double t1, t2;
	  gettimeofday(&tv, NULL);
	  t1 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001;
          pthread_cond_wait(&mCV, &mMutTaskQueue); 
	  gettimeofday(&tv, NULL);
	  t2 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001;
	  mLocalCounters[tid].mStarvTime += (t2 - t1);
        }       
	if(mSteps >= mLocalSteps) {
	  pthread_cond_broadcast(&mCV);
          pthread_mutex_unlock(&mMutTaskQueue);
          break;
	} else if(!mTaskQueue.empty()) {
          s = mTaskQueue.top ();
          mTaskQueue.pop ();
          mStarv --;
	  mLocalCounters[tid].mGet ++;
	  pthread_mutex_unlock(&mMutTaskQueue);
        } else {
	  pthread_cond_broadcast(&mCV);
          pthread_mutex_unlock(&mMutTaskQueue);
          break;
        }
      } else {
        s = ltq.back();
        ltq.pop_back();
      }
      if (!mSetFactory->discard (s, getRecord())){
        mSetFactory->branch (s, lsetv, lsolv, getRecord(),  mInfos + tid, ltq.size());
        typename ProblemFactory::ValueType rec = getRecord();
        while(!lsolv.empty()) {
          Solution s = lsolv.back();
          lsolv.pop_back();
          if(((Factory::getProblemType() == BNB_MAXIMIZE) && (s.getValue() > rec)) ||
	     ((Factory::getProblemType() == BNB_MINIMIZE) && (s.getValue() < rec))) {
            rec = s.getValue();
            if(!solv.empty())
              solv.pop_back();
            solv.push_back(s);
          }
        }
	updateRecord(rec);
        while(!lsetv.empty()) {
          Set s = lsetv.back();
          lsetv.pop_back();
          if(!mSetFactory->discard (s, rec))
            ltq.push_back(s);
          else 
            mInfos[tid].mDiscardedByRecord ++;         
        }
        
	mSteps = BNBMAX(mSteps, step);
	if(mSteps >= mLocalSteps) {
	  pthread_mutex_lock(&mMutTaskQueue);
	  if(mStarv)
	    pthread_cond_broadcast(&mCV);
	  pthread_mutex_unlock(&mMutTaskQueue);
	  break;
	}

        if((step % mUpdateRatio) == 0) {
          if(!ltq.empty()) {
	    pthread_mutex_lock(&mMutTaskQueue);
	    struct timeval tv;
	    double t1, t2;
	    gettimeofday(&tv, NULL);
	    t1 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001;
	    mLocalCounters[tid].mDonat ++;
	    for(int i = 0; i < mPutChunk; i ++) {
	      if(!ltq.empty()) {
                MMRegistry::registerMemManager(mAuxMemManager);
		Set s = ltq.back();
		mTaskQueue.push(s);
                MMRegistry::registerMemManager(mManagers[tid]);
		ltq.pop_back();
	        mLocalCounters[tid].mPut ++;
	      } else
		break;
	    }
	    mQLen = mTaskQueue.size();
	    mMaxQLen = BNBMAX(mQLen, mMaxQLen);
	    if(mStarv)
	      pthread_cond_broadcast(&mCV);
	    t2 = (double)tv.tv_sec + (double)tv.tv_usec * 0.000001;
	    mLocalCounters[tid].mDonatTime += t2 - t1;
	    pthread_mutex_unlock(&mMutTaskQueue);
	  }
        }
      } else {
        mInfos[tid].mDiscardedByRecord ++;
      }
    }
    
    pthread_mutex_lock(&mMutTaskQueue);
    while(!ltq.empty()) {
      Set s = ltq.back();
      ltq.pop_back();
      mTaskQueue.push(s);
    }   
    pushSolutions (solv, mInfos + tid);
    pthread_mutex_unlock(&mMutTaskQueue);
  }
コード例 #5
0
ファイル: test_quant_blas.cpp プロジェクト: naibaf7/caffe
// Compares a quantized GEMM against a float reference GEMM.
//
// For 25 random configurations (matrix sizes in [1, 256], random
// transposition flags, random alpha/beta that are either 0, 1 or a
// quantized value in [-2, 2]) the test:
//   1. computes C_result = alpha * op(A) * op(B) + beta * C in float,
//   2. lets observing quantizers see A, B, C, C_result, alpha and beta,
//   3. quantizes the operands and runs the Dtype GEMM (CPU or device),
//   4. de-quantizes the output and requires every element to be within
//      5% of the largest observed |C| value of the float reference.
TYPED_TEST(QuantBlasTest, TestGemmComparativeFloatQuant) {
  typedef typename TypeParam::Dtype Dtype;

  // Expect at most 5% error
  const float percentile_eps = 0.05;

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  // Need to test > 64 dimension
  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_real_distribution<float> valRand(-2.0, 2.0);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    const int_tp M = dimsRand(rngen);
    const int_tp N = dimsRand(rngen);
    const int_tp K = dimsRand(rngen);

    const CBLAS_TRANSPOSE trans_A =
        boolRand(rngen) ? CblasTrans : CblasNoTrans;
    const CBLAS_TRANSPOSE trans_B =
        boolRand(rngen) ? CblasTrans : CblasNoTrans;

    // At least one of alpha / beta is always present.
    const bool has_alpha = boolRand(rngen);
    const bool has_beta = has_alpha ? boolRand(rngen) : true;

    // A present scalar is either exactly 1 or a random quantized value.
    const bool alpha_with_quant = boolRand(rngen) && has_alpha;
    const bool beta_with_quant = boolRand(rngen) && has_beta;

    float alpha_val = 0.0;
    if (has_alpha) {
      alpha_val = alpha_with_quant ? valRand(rngen) : float(1.0);
    }

    float beta_val = 0.0;
    if (has_beta) {
      beta_val = beta_with_quant ? valRand(rngen) : float(1.0);
    }

    // 4-axis blob shapes; the matrix occupies the last two axes.
    vector<int_tp> A_shape(4, 1);
    vector<int_tp> B_shape(4, 1);
    vector<int_tp> C_shape(4, 1);

    A_shape[2] = M;
    A_shape[3] = K;
    B_shape[2] = K;
    B_shape[3] = N;
    C_shape[2] = M;
    C_shape[3] = N;

    Blob<float> A(A_shape, Caffe::GetDefaultDevice());
    Blob<float> B(B_shape, Caffe::GetDefaultDevice());
    Blob<float> C(C_shape, Caffe::GetDefaultDevice());
    Blob<float> C_result(C_shape, Caffe::GetDefaultDevice());

    Blob<Dtype> A_quant(A_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> B_quant(B_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> C_quant(C_shape, Caffe::GetDefaultDevice());

    Blob<float> C_unquant(C_shape, Caffe::GetDefaultDevice());

    caffe_rng_gaussian(M * K, (float)0.0, (float)0.5,
                       A.mutable_cpu_data());
    caffe_rng_gaussian(K * N, (float)0.0, (float)0.5,
                       B.mutable_cpu_data());
    caffe_rng_gaussian(M * N, (float)0.0, (float)0.5,
                       C.mutable_cpu_data());

    // The reference GEMM accumulates into a copy so that the original C
    // stays available for quantization.
    caffe_copy(M * N, C.cpu_data(), C_result.mutable_cpu_data());

    QuantizerParameter qpm_a;
    QuantizerParameter qpm_b;
    QuantizerParameter qpm_c;
    QuantizerParameter qpm_alpha;
    QuantizerParameter qpm_beta;
    qpm_a.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_b.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_c.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_alpha.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_beta.set_mode(CAFFE_QUANT_OBSERVE);

    Quantizer<float, Dtype> aq(qpm_a);
    Quantizer<float, Dtype> bq(qpm_b);
    Quantizer<float, Dtype> cq(qpm_c);
    Quantizer<float, Dtype> alphaq(qpm_alpha);
    Quantizer<float, Dtype> betaq(qpm_beta);

    // Normal GEMM
    caffe_gemm<float>(
                trans_A, trans_B,
                M, N, K,
                alpha_val,
                A.cpu_data(), B.cpu_data(),
                beta_val,
                C_result.mutable_cpu_data());

    // Observe all values that will be relevant for quantization.
    // The C quantizer sees both the input C and the reference output.
    aq.ObserveIn_cpu(M * K, A.cpu_data());
    bq.ObserveIn_cpu(K * N, B.cpu_data());
    cq.ObserveIn_cpu(M * N, C.cpu_data());
    cq.ObserveIn_cpu(M * N, C_result.cpu_data());
    alphaq.ObserveIn_cpu(1, &alpha_val);
    betaq.ObserveIn_cpu(1, &beta_val);

    // Apply observed values to the quantizer
    aq.update();
    bq.update();
    cq.update();
    alphaq.update();
    betaq.update();

    // Quantize A, B and C
    aq.Forward_cpu(M * K, A.cpu_data(), A_quant.mutable_cpu_data());
    bq.Forward_cpu(K * N, B.cpu_data(), B_quant.mutable_cpu_data());
    cq.Forward_cpu(M * N, C.cpu_data(), C_quant.mutable_cpu_data());

    // Unquantized scalars are passed as plain 0 / 1.
    Dtype alpha_val_quant = has_alpha;
    Dtype beta_val_quant = has_beta;

    // Quantize alpha
    if (alpha_with_quant) {
      alphaq.Forward_cpu(1, &alpha_val, &alpha_val_quant);
    }

    // Quantize beta
    if (beta_with_quant) {
      betaq.Forward_cpu(1, &beta_val, &beta_val_quant);
    }

    /*
    std::cout << "C max:" << cq.in_quantizer_values().max << std::endl;
    std::cout << "C min:" << cq.in_quantizer_values().min << std::endl;
    std::cout << "C zero:" << cq.in_quantizer_values().zero << std::endl;
    std::cout << "C scale:" << cq.in_quantizer_values().scale << std::endl;
    std::cout << "C max:" << cq.out_quantizer_values().max << std::endl;
    std::cout << "C min:" << cq.out_quantizer_values().min << std::endl;
    std::cout << "C zero:" << cq.out_quantizer_values().zero << std::endl;
    std::cout << "C scale:" <<  cq.out_quantizer_values().scale << std::endl;
    */

    // Quantized GEMM; a null quantizer pointer marks an unquantized scalar.
    if (Caffe::mode() == Caffe::Brew::CPU) {
      caffe_gemm<Dtype>(
                  trans_A, trans_B,
                  M, N, K,
                  alpha_val_quant,
                  A_quant.cpu_data(), B_quant.cpu_data(),
                  beta_val_quant,
                  C_quant.mutable_cpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(bq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(cq.out_quantizer_values()));
    } else {
      Caffe::GetDefaultDevice()->template gemm<Dtype>(trans_A, trans_B,
                  M, N, K,
                  alpha_val_quant,
                  A_quant.gpu_data(), B_quant.gpu_data(),
                  beta_val_quant,
                  C_quant.mutable_gpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(bq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(cq.out_quantizer_values()));
    }

    // Map the quantized result back to float for comparison.
    cq.Backward_cpu(M * N, C_quant.cpu_data(), C_unquant.mutable_cpu_data());

    // print_matrix(A_quant.cpu_data(), M, K);
    // print_matrix(B_quant.cpu_data(), K, N);

    // print_matrix(C_quant.cpu_data(), M, N);
    // print_matrix(C_result.cpu_data(), M, N);
    // print_matrix(C_unquant.cpu_data(), M, N);

    // Tolerance: 5% of the largest magnitude observed by the C quantizer.
    const QuantizerValues& cqv = cq.in_quantizer_values();
    const float eps = std::max(std::abs(cqv.get_max<float>()),
                               std::abs(cqv.get_min<float>())) * percentile_eps;

    // Fetch the data pointers once instead of on every iteration.
    const float* unquant_data = C_unquant.cpu_data();
    const float* result_data = C_result.cpu_data();
    for (int_tp i = 0; i < M * N; ++i) {
      EXPECT_NEAR(unquant_data[i], result_data[i], eps);
      // One error is enough to abort
      if (fabs(unquant_data[i] - result_data[i]) >= eps) {
        break;
      }
    }
  }
}
コード例 #6
0
ファイル: test_quant_blas.cpp プロジェクト: naibaf7/caffe
// Compares a quantized GEMV against a float reference GEMV.
//
// For 25 random configurations (matrix sizes in [1, 256], random
// transposition flag, random alpha/beta that are either 0, 1 or a
// quantized value in [-2, 2]) the test:
//   1. computes y_result = alpha * op(A) * x + beta * y in float,
//   2. lets observing quantizers see A, x, y, y_result, alpha and beta,
//   3. quantizes the operands and runs the Dtype GEMV (CPU or device),
//   4. de-quantizes the output and requires every element to be within
//      5% of the largest observed |y| value of the float reference.
TYPED_TEST(QuantBlasTest, TestGemvComparativeFloatQuant) {
  typedef typename TypeParam::Dtype Dtype;

  // Expect at most 5% error
  const float percentile_eps = 0.05;

  std::random_device rdev;
  std::mt19937 rngen(rdev());

  // Need to test > 64 dimension
  std::uniform_int_distribution<int_tp> dimsRand(1, 256);
  std::uniform_int_distribution<int_tp> boolRand(0, 1);
  std::uniform_real_distribution<float> valRand(-2.0, 2.0);

  for (int_tp testIdx = 0; testIdx < 25; ++testIdx) {
    const int_tp M = dimsRand(rngen);
    const int_tp N = dimsRand(rngen);

    const CBLAS_TRANSPOSE trans_A =
        boolRand(rngen) ? CblasTrans : CblasNoTrans;

    // A is always stored as M x N; transposition swaps which dimension
    // the input vector x and the output vector y run over.
    const int_tp x_len = trans_A == CblasTrans ? M : N;
    const int_tp y_len = trans_A == CblasTrans ? N : M;

    // At least one of alpha / beta is always present.
    const bool has_alpha = boolRand(rngen);
    const bool has_beta = has_alpha ? boolRand(rngen) : true;

    // A present scalar is either exactly 1 or a random quantized value.
    const bool alpha_with_quant = boolRand(rngen) && has_alpha;
    const bool beta_with_quant = boolRand(rngen) && has_beta;

    float alpha_val = 0.0;
    if (has_alpha) {
      alpha_val = alpha_with_quant ? valRand(rngen) : float(1.0);
    }

    float beta_val = 0.0;
    if (has_beta) {
      beta_val = beta_with_quant ? valRand(rngen) : float(1.0);
    }

    // 4-axis blob shapes; vectors occupy the last axis only.
    vector<int_tp> A_shape(4, 1);
    vector<int_tp> x_shape(4, 1);
    vector<int_tp> y_shape(4, 1);

    A_shape[2] = M;
    A_shape[3] = N;
    x_shape[3] = x_len;
    y_shape[3] = y_len;

    Blob<float> A(A_shape, Caffe::GetDefaultDevice());
    Blob<float> x(x_shape, Caffe::GetDefaultDevice());
    Blob<float> y(y_shape, Caffe::GetDefaultDevice());
    Blob<float> y_result(y_shape, Caffe::GetDefaultDevice());

    Blob<Dtype> A_quant(A_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> x_quant(x_shape, Caffe::GetDefaultDevice());
    Blob<Dtype> y_quant(y_shape, Caffe::GetDefaultDevice());

    Blob<float> y_unquant(y_shape, Caffe::GetDefaultDevice());

    caffe_rng_gaussian(M * N, (float)0.0, (float)0.5,
                       A.mutable_cpu_data());
    caffe_rng_gaussian(x_len,
                       (float)0.0, (float)0.5, x.mutable_cpu_data());
    caffe_rng_gaussian(y_len,
                       (float)0.0, (float)0.5, y.mutable_cpu_data());

    // The reference GEMV accumulates into a copy so that the original y
    // stays available for quantization.
    caffe_copy(y_len, y.cpu_data(), y_result.mutable_cpu_data());

    QuantizerParameter qpm_a;
    QuantizerParameter qpm_x;
    QuantizerParameter qpm_y;
    QuantizerParameter qpm_alpha;
    QuantizerParameter qpm_beta;
    qpm_a.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_x.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_y.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_alpha.set_mode(CAFFE_QUANT_OBSERVE);
    qpm_beta.set_mode(CAFFE_QUANT_OBSERVE);

    Quantizer<float, Dtype> aq(qpm_a);
    Quantizer<float, Dtype> xq(qpm_x);
    Quantizer<float, Dtype> yq(qpm_y);
    Quantizer<float, Dtype> alphaq(qpm_alpha);
    Quantizer<float, Dtype> betaq(qpm_beta);

    // Float reference GEMV
    caffe_gemv<float>(
                trans_A,
                M, N,
                alpha_val,
                A.cpu_data(), x.cpu_data(),
                beta_val,
                y_result.mutable_cpu_data());

    // Observe all values that will be relevant for quantization.
    // The y quantizer sees both the input y and the reference output.
    aq.ObserveIn_cpu(M * N, A.cpu_data());
    xq.ObserveIn_cpu(x_len, x.cpu_data());
    yq.ObserveIn_cpu(y_len, y.cpu_data());
    yq.ObserveIn_cpu(y_len, y_result.cpu_data());
    alphaq.ObserveIn_cpu(1, &alpha_val);
    betaq.ObserveIn_cpu(1, &beta_val);

    // Apply observed values to the quantizer
    aq.update();
    xq.update();
    yq.update();
    alphaq.update();
    betaq.update();

    // Quantize A, x and y
    aq.Forward_cpu(M * N, A.cpu_data(), A_quant.mutable_cpu_data());
    xq.Forward_cpu(x_len, x.cpu_data(), x_quant.mutable_cpu_data());
    yq.Forward_cpu(y_len, y.cpu_data(), y_quant.mutable_cpu_data());

    // Unquantized scalars are passed as plain 0 / 1.
    Dtype alpha_val_quant = has_alpha;
    Dtype beta_val_quant = has_beta;

    // Quantize alpha
    if (alpha_with_quant) {
      alphaq.Forward_cpu(1, &alpha_val, &alpha_val_quant);
    }

    // Quantize beta
    if (beta_with_quant) {
      betaq.Forward_cpu(1, &beta_val, &beta_val_quant);
    }

    // Quantized GEMV; a null quantizer pointer marks an unquantized scalar.
    if (Caffe::mode() == Caffe::Brew::CPU) {
      caffe_gemv<Dtype>(trans_A, M, N,
                  alpha_val_quant,
                  A_quant.cpu_data(), x_quant.cpu_data(),
                  beta_val_quant,
                  y_quant.mutable_cpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));
    } else {
      Caffe::GetDefaultDevice()->template gemv<Dtype>(trans_A, M, N,
                  alpha_val_quant,
                  A_quant.gpu_data(), x_quant.gpu_data(),
                  beta_val_quant,
                  y_quant.mutable_gpu_data(),
                  alpha_with_quant ? &(alphaq.out_quantizer_values()) : nullptr,
                  &(aq.out_quantizer_values()),
                  &(xq.out_quantizer_values()),
                  beta_with_quant ? &(betaq.out_quantizer_values()) : nullptr,
                  &(yq.out_quantizer_values()));
    }

    // Map the quantized result back to float for comparison.
    yq.Backward_cpu(y_len, y_quant.cpu_data(), y_unquant.mutable_cpu_data());

    // Tolerance: 5% of the largest magnitude observed by the y quantizer.
    const QuantizerValues& yqv = yq.in_quantizer_values();
    const float eps = std::max(std::abs(yqv.get_max<float>()),
                               std::abs(yqv.get_min<float>())) * percentile_eps;

    // Fetch the data pointers once instead of on every iteration.
    const float* unquant_data = y_unquant.cpu_data();
    const float* result_data = y_result.cpu_data();
    for (int_tp i = 0; i < y_len; ++i) {
      EXPECT_NEAR(unquant_data[i], result_data[i], eps);
      // One error is enough to abort
      if (fabs(unquant_data[i] - result_data[i]) >= eps) {
        break;
      }
    }
  }
}