void verifyThrNum::doJob(){ try{ de.coldDish(inputData, LEN, R, DATAFROM); de.coldDish(ptnData, LEN, R, PTNFROM); cout<<"input good"<<endl; }catch(int){ cout<<"Subprocess Exception.(verifyThrNum_doJob())"<<endl; errorFlag = 1; } if(errorFlag != 1){ for(unsigned k = 0; k< R; k++){ for(unsigned i= 0; i< LEN; i++){ mm(0, i)= inputData(k, i); mt(i, 0)= inputData(k, i); } axpy_prod(mt, mm, weight, false); } while(1){ string str; cout<<"choose pattern 0~2 to recognize(-1 to stop):"; getline (cin,str); if(str.compare("-1")==0) break; for(unsigned i= 0; i< ptnData.size2(); i++) pattern(0, i)= ptnData(atoi(str.c_str()), i); axpy_prod(pattern, weight, output); for(unsigned i=0; i <LEN; i++){ if(output(0, i)>0) recgPtn(0, i)= 1; else if(output(0, i)<0) recgPtn(0, i)= -1; } cout<<"pattern "<<str<<": "<<endl; de.print2Matrix(COL, pattern); cout<<"recognized:"<<endl; de.print2Matrix(COL, recgPtn); } } }
/**
 * Generates wishart matrix allowing for singular wishart.
 *
 * Decomposes S with lapack::gesvd, keeps only the components whose entry in
 * D exceeds 1e-9 times norm_inf(D), builds the factor Q from the retained
 * components and draws mc samples via wishart_1(df, Q, p, r).
 *
 * @param df  Forwarded unchanged to wishart_1 (presumably the degrees of
 *            freedom - confirm against wishart_1).
 * @param S   Square input matrix; p is taken from S.size1(). It is passed to
 *            lapack::gesvd directly, no copy is made here.
 *            NOTE(review): LAPACK's gesvd overwrites its input matrix, so
 *            callers should presumably not rely on S afterwards - confirm.
 * @param mc  Number of samples to generate.
 * @return    A vector holding the mc matrices returned by wishart_1.
 */
ublas::vector<ublas::matrix<double> > wishart_InvA_rnd(const int df, ublas::matrix<double>& S, const int mc) {
    size_t p = S.size1();
    ublas::vector<double> D(p);
    ublas::matrix<double> P(p, p);
    ublas::matrix<double> F(p, p);
    F = ublas::zero_matrix<double>(p, p);

    // Earlier variants (kept for reference): copying S first via
    // ublas::matrix<double> SS(S), and svd0(S, P, D, F) followed by P = trans(P).
    lapack::gesvd('A', 'A', S, D, P, F);

    //! correct for singular matrix
    // The cut-off does not change while scanning D, so compute it once
    // instead of re-evaluating norm_inf(D) for every element.
    const double cutoff = norm_inf(D) * 1e-9;
    std::vector<size_t> ii;
    for (size_t i = 0; i < D.size(); ++i) {
        if (D(i) > cutoff)
            ii.push_back(i);
    }

    // Column selection: the indices of the retained components.
    size_t r = ii.size();
    ublas::indirect_array<> idx(r);
    for (size_t i = 0; i < r; ++i)
        idx(i) = ii[i];

    // Row selection: all p rows, in order.
    ublas::indirect_array<> irow(p);
    for (size_t i = 0; i < irow.size(); ++i)
        irow(i) = i;

    // Q = (selected columns of trans(P)) * diag(inv_sqrt applied to the
    // retained entries of D). An rprod-based variant was tried and, per the
    // original author, was not any faster than diagonalizing D first.
    ublas::matrix<double> Q(p, r);
    axpy_prod(project(trans(P), irow, idx),
              diagm(ublas::apply_to_all<functor::inv_sqrt<double> >(project(D, idx))),
              Q, true);

    // generate mc samples
    ublas::vector<ublas::matrix<double> > K(mc);
    for (int i = 0; i < mc; ++i)
        K(i) = wishart_1(df, Q, p, r);

    return K;
}
int main(int argc, char *argv[]) { int64_t t1, t2, T1=0, T2=0; Kratos::OpenCL::DeviceGroup DeviceGroup(CL_DEVICE_TYPE_GPU, true); DeviceGroup.AddCLSearchPath("../"); cl_uint Program = DeviceGroup.BuildProgramFromFile("opencl_spmv.cl", "-cl-fast-relaxed-math"); cl_uint Kernel = DeviceGroup.RegisterKernel(Program, "CSR_Matrix_Vector_Multiply", WORKGROUP_SIZE); Kratos::CompressedMatrix A; Kratos::Vector X, Y1, Y2; Kratos::ReadMatrixMarketMatrix(argv[1], A); Kratos::ReadMatrixMarketVector(argv[2], X); Y1.resize(A.size1()); Y2.resize(A.size1()); cl_uint A_RowIndices = DeviceGroup.CreateBuffer((A.size1() + 1) * sizeof(cl_ulong), CL_MEM_READ_ONLY); cl_uint A_ColumnIndices = DeviceGroup.CreateBuffer(A.nnz() * sizeof(cl_ulong), CL_MEM_READ_ONLY); cl_uint A_Values = DeviceGroup.CreateBuffer(A.nnz() * sizeof(cl_double), CL_MEM_READ_ONLY); cl_uint X_Values = DeviceGroup.CreateBuffer(A.size1() * sizeof(cl_double), CL_MEM_READ_ONLY); cl_uint Y_Values = DeviceGroup.CreateBuffer(A.size1() * sizeof(cl_double), CL_MEM_WRITE_ONLY); DeviceGroup.CopyBuffer(A_RowIndices, Kratos::OpenCL::HostToDevice, Kratos::OpenCL::VoidPList(1, &A.index1_data()[0])); DeviceGroup.CopyBuffer(A_ColumnIndices, Kratos::OpenCL::HostToDevice, Kratos::OpenCL::VoidPList(1, &A.index2_data()[0])); DeviceGroup.CopyBuffer(A_Values, Kratos::OpenCL::HostToDevice, Kratos::OpenCL::VoidPList(1, &A.value_data()[0])); DeviceGroup.CopyBuffer(X_Values, Kratos::OpenCL::HostToDevice, Kratos::OpenCL::VoidPList(1, &X[0])); DeviceGroup.SetBufferAsKernelArg(Kernel, 0, A_RowIndices); DeviceGroup.SetBufferAsKernelArg(Kernel, 1, A_ColumnIndices); DeviceGroup.SetBufferAsKernelArg(Kernel, 2, A_Values); DeviceGroup.SetBufferAsKernelArg(Kernel, 3, X_Values); DeviceGroup.SetBufferAsKernelArg(Kernel, 4, Y_Values); DeviceGroup.SetKernelArg(Kernel, 5, A.size1()); DeviceGroup.SetLocalMemAsKernelArg(Kernel, 6, (ROWS_PER_WORKGROUP + 1) * sizeof(cl_ulong)); DeviceGroup.SetLocalMemAsKernelArg(Kernel, 7, WORKGROUP_SIZE * sizeof(cl_double)); 
std::cout << "Launch size: " << A.size1() * LOCAL_WORKGROUP_SIZE + 1 << std::endl; for (unsigned int i = 0; i < N; i++) { t1 = Timer(); DeviceGroup.ExecuteKernel(Kernel, A.size1() * LOCAL_WORKGROUP_SIZE + 1); DeviceGroup.Synchronize(); t2 = Timer(); if (i == 0 || t2 - t1 < T1) { T1 = t2 - t1; } } DeviceGroup.CopyBuffer(Y_Values, Kratos::OpenCL::DeviceToHost, Kratos::OpenCL::VoidPList(1, &Y1[0])); for (unsigned int i = 0; i < N; i++) { t1 = Timer(); axpy_prod(A, X, Y2); t2 = Timer(); if (i == 0 || t2 - t1 < T2) { T2 = t2 - t1; } } for (cl_uint i = 0; i < A.size1(); i++) { if (fabs(Y1[i] - Y2[i]) > 1e-10) { std::cout << "Error in location " << i << ": " << Y1[i] << " " << Y2[i] << std::endl; } } std::cout << "Norm_2 of Y1 is " << norm_2(Y1) << "." << std::endl; std::cout << "Norm_2 of Y2 is " << norm_2(Y2) << "." << std::endl; std::cout << "Test finished." << std::endl << "OpenCL SpMV:\t" << T1 / 1000000.00 << " ms" << std::endl << "uBlas:\t\t" << T2 / 1000000.00 << " ms" << std::endl; return 0; }