void test_svd(const std::string & fn, ScalarType EPS)
{
  std::size_t sz1, sz2;

  //read matrix
  // sz1 = 2048, sz2 = 2048;
  // std::vector<ScalarType> in(sz1 * sz2);
  // random_fill(in);

  // read file
  std::fstream f(fn.c_str(), std::fstream::in);
  //read size of input matrix
  read_matrix_size(f, sz1, sz2);

  std::size_t to = std::min(sz1, sz2);

  viennacl::matrix<ScalarType> Ai(sz1, sz2), Aref(sz1, sz2), QL(sz1, sz1), QR(sz2, sz2);
  read_matrix_body(f, Ai);

  std::vector<ScalarType> sigma_ref(to);
  read_vector_body(f, sigma_ref);

  f.close();

  // viennacl::fast_copy(&in[0], &in[0] + in.size(), Ai);

  Aref = Ai;

  viennacl::tools::timer timer;
  timer.start();

  viennacl::linalg::svd(Ai, QL, QR);

  viennacl::backend::finish();

  double time_spend = timer.get();

  viennacl::matrix<ScalarType> result1(sz1, sz2), result2(sz1, sz2);
  result1 = viennacl::linalg::prod(QL, Ai);
  result2 = viennacl::linalg::prod(result1, trans(QR));

  ScalarType sigma_diff = sigmas_compare(Ai, sigma_ref);
  ScalarType prods_diff = matrix_compare(result2, Aref);

  // note: computing the product is not accurate down to 10^{-16},
  // so we allow a higher tolerance for prods_diff here
  bool sigma_ok = (std::fabs(sigma_diff) < EPS)
               && (std::fabs(prods_diff) < std::sqrt(EPS));

  printf("%6s [%dx%d] %40s sigma_diff = %.6f; prod_diff = %.6f; time = %.6f\n",
         sigma_ok ? "[[OK]]" : "[FAIL]",
         (int)Aref.size1(), (int)Aref.size2(), fn.c_str(),
         sigma_diff, prods_diff, time_spend);

  if (!sigma_ok)
    exit(EXIT_FAILURE);
}
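// The helpers sigmas_compare() and matrix_compare() are used above but not
// shown in this excerpt. A minimal sketch of matrix_compare, assuming it
// returns the largest absolute element-wise difference scaled by the largest
// reference entry (internal_size() and the device-to-host fast_copy() are
// standard ViennaCL API; <algorithm> and <cmath> are assumed included):
template <typename MatrixType>
ScalarType matrix_compare(MatrixType & res, MatrixType & ref)
{
  std::vector<ScalarType> res_std(res.internal_size());
  std::vector<ScalarType> ref_std(ref.internal_size());

  viennacl::fast_copy(res, &res_std[0]);  // device -> host
  viennacl::fast_copy(ref, &ref_std[0]);

  ScalarType diff = 0;
  ScalarType mx   = 0;
  for (std::size_t i = 0; i < res_std.size(); ++i)
  {
    diff = std::max(diff, std::fabs(res_std[i] - ref_std[i]));
    mx   = std::max(mx,   std::fabs(ref_std[i]));
  }
  return diff / mx;  // relative to the largest reference entry
}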
void test_nmf(std::size_t m, std::size_t k, std::size_t n)
{
  std::vector<ScalarType> stl_w(m * k);
  std::vector<ScalarType> stl_h(k * n);

  viennacl::matrix<ScalarType> v_ref(m, n);
  viennacl::matrix<ScalarType> w_ref(m, k);
  viennacl::matrix<ScalarType> h_ref(k, n);

  fill_random(stl_w);
  fill_random(stl_h);

  viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w_ref);
  viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h_ref);

  v_ref = viennacl::linalg::prod(w_ref, h_ref);  //reference

  // Fill again with random numbers:
  fill_random(stl_w);
  fill_random(stl_h);

  viennacl::matrix<ScalarType> w_nmf(m, k);
  viennacl::matrix<ScalarType> h_nmf(k, n);

  viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w_nmf);
  viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h_nmf);

  viennacl::linalg::nmf_config conf;
  viennacl::linalg::nmf(v_ref, w_nmf, h_nmf, conf);

  viennacl::matrix<ScalarType> v_nmf = viennacl::linalg::prod(w_nmf, h_nmf);

  ScalarType diff = matrix_compare(v_ref, v_nmf);
  bool diff_ok = std::fabs(diff) < EPS;

  long iterations = static_cast<long>(conf.iters());
  printf("%6s [%lux%lux%lu] diff = %.6f (%ld iterations)\n",
         diff_ok ? "[[OK]]" : "[FAIL]", m, k, n, diff, iterations);

  if (!diff_ok)
    exit(EXIT_FAILURE);
}
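// fill_random() is assumed to fill a host vector with values in [0, 1);
// NMF requires non-negative input, so the factors must start non-negative.
// A minimal sketch of one plausible implementation, using rand() from
// <cstdlib>:
void fill_random(std::vector<ScalarType> & v)
{
  for (std::size_t i = 0; i < v.size(); ++i)
    v[i] = static_cast<ScalarType>(rand()) / static_cast<ScalarType>(RAND_MAX);
}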
void record_demo_frame(void)
{
  vms_angvec pbh;

  //mprintf(0, "Record start...");
  mprintf(0, "Curtime = %6i, Last time = %6i\n", Player_stats.time_total, Demo_last_time);

  if (GameTime - Demo_last_time >= 65536) {  // 65536 = one second in 16.16 fixed point
    Demo_last_time = GameTime;

    if (Demo_record_index < MAX_DEMO_RECS) {
      demorec *demo_ptr = &Demo_records[Demo_record_index];
      vms_matrix tempmat;

      demo_ptr->time = GameTime - Demo_start_time;

      demo_ptr->x = Player->pos.x;
      demo_ptr->y = Player->pos.y;
      demo_ptr->z = Player->pos.z;

      // Round-trip the orientation through pitch/bank/heading angles, then
      // compare the rebuilt matrix against the original to gauge the error
      // introduced by storing angles instead of the full matrix.
      vm_extract_angles_matrix(&pbh, &Player->orient);
      vm_angles_2_matrix(&tempmat, &pbh);
      matrix_compare(&tempmat, &Player->orient);

      demo_ptr->p = pbh.p;
      demo_ptr->b = pbh.b;
      demo_ptr->h = pbh.h;

      demo_ptr->segnum = Player->segnum;

      Demo_record_index++;
      Num_demo_recs = Demo_record_index;

      // if (firing)
      //   demo_ptr->specials = 1;
      // else
      //   demo_ptr->specials = 0;
    }
  }
  //mprintf(0, "Record end\n");
}
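/* The demorec layout is implied by the assignments above. A sketch consistent
 * with those fields; the types are assumptions (fix/fixang being Descent's
 * 16.16 fixed-point and angle types), and `specials` appears only in the
 * commented-out code: */
typedef struct demorec {
  fix    time;      /* time since demo start               */
  fix    x, y, z;   /* player position                     */
  fixang p, b, h;   /* pitch, bank, heading                */
  short  segnum;    /* segment containing the player       */
  /* ubyte specials; */
} demorec;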
void test_eigen(const std::string & fn, bool is_symm)
{
  std::cout << "Reading..." << "\n";
  std::size_t sz;

  // read file
  std::fstream f(fn.c_str(), std::fstream::in);
  //read size of input matrix
  read_matrix_size(f, sz);

  bool is_row = viennacl::is_row_major<MatrixLayout>::value;
  if (is_row)
    std::cout << "Testing row-major matrix of size " << sz << "-by-" << sz << std::endl;
  else
    std::cout << "Testing column-major matrix of size " << sz << "-by-" << sz << std::endl;

  viennacl::matrix<ScalarType> A_input(sz, sz), A_ref(sz, sz), Q(sz, sz);

  // reference vector with reference values from file
  std::vector<ScalarType> eigen_ref_re(sz);
  // calculated real eigenvalues
  std::vector<ScalarType> eigen_re(sz);
  // calculated imaginary eigenvalues
  std::vector<ScalarType> eigen_im(sz);

  // read input matrix from file
  read_matrix_body(f, A_input);
  // read reference eigenvalues from file
  read_vector_body(f, eigen_ref_re);

  f.close();

  A_ref = A_input;

  std::cout << "Calculation..." << "\n";

  Timer timer;
  timer.start();

  // Start the calculation
  if (is_symm)
    viennacl::linalg::qr_method_sym(A_input, Q, eigen_re);
  else
    viennacl::linalg::qr_method_nsm(A_input, Q, eigen_re, eigen_im);

  /*
  std::cout << "\n\n Matrix A: \n\n";
  matrix_print(A_input);
  std::cout << "\n\n Matrix Q: \n\n";
  matrix_print(Q);
  */

  double time_spend = timer.get();

  std::cout << "Verification..." << "\n";

  bool is_hessenberg = check_hessenberg(A_input);
  bool is_tridiag = check_tridiag(A_input);

  ublas::matrix<ScalarType> A_ref_ublas(sz, sz), A_input_ublas(sz, sz), Q_ublas(sz, sz), result1(sz, sz), result2(sz, sz);
  viennacl::copy(A_ref, A_ref_ublas);
  viennacl::copy(A_input, A_input_ublas);
  viennacl::copy(Q, Q_ublas);

  // compute result1 = ublas::prod(Q_ublas, A_input_ublas); (terribly slow when using ublas directly)
  for (std::size_t i = 0; i < result1.size1(); ++i)
    for (std::size_t j = 0; j < result1.size2(); ++j)
    {
      ScalarType value = 0;
      for (std::size_t k = 0; k < Q_ublas.size2(); ++k)
        value += Q_ublas(i, k) * A_input_ublas(k, j);
      result1(i, j) = value;
    }

  // compute result2 = ublas::prod(A_ref_ublas, Q_ublas); (terribly slow when using ublas directly)
  for (std::size_t i = 0; i < result2.size1(); ++i)
    for (std::size_t j = 0; j < result2.size2(); ++j)
    {
      ScalarType value = 0;
      for (std::size_t k = 0; k < A_ref_ublas.size2(); ++k)
        value += A_ref_ublas(i, k) * Q_ublas(k, j);
      result2(i, j) = value;
    }

  ScalarType prods_diff = matrix_compare(result1, result2);
  ScalarType eigen_diff = vector_compare(eigen_re, eigen_ref_re);

  bool is_ok = is_hessenberg;
  if (is_symm)
    is_ok = is_ok && is_tridiag;
  is_ok = is_ok && (eigen_diff < EPS);
  is_ok = is_ok && (prods_diff < EPS);

  // Debug output (disabled):
  // std::cout << A_ref << "\n" << A_input << "\n" << Q << "\n";
  // std::cout << eigen_re << "\n" << eigen_im << "\n" << eigen_ref_re << "\n";
  // std::cout << result1 << "\n" << result2 << "\n";

  printf("%6s [%dx%d] %40s time = %.4f\n",
         is_ok ? "[[OK]]" : "[FAIL]",
         (int)A_ref.size1(), (int)A_ref.size2(), fn.c_str(), time_spend);
  printf("tridiagonal = %d, hessenberg = %d, prod-diff = %f, eigen-diff = %f\n",
         is_tridiag, is_hessenberg, prods_diff, eigen_diff);
  std::cout << std::endl << std::endl;

  if (!is_ok)
    exit(EXIT_FAILURE);
}
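// check_hessenberg() / check_tridiag() verify the shape of A_input after the
// QR method has run. A minimal sketch of the Hessenberg check, assuming the
// matrix is copied to the host first and EPS is the global tolerance:
bool check_hessenberg(viennacl::matrix<ScalarType> const & A_vcl)
{
  ublas::matrix<ScalarType> A(A_vcl.size1(), A_vcl.size2());
  viennacl::copy(A_vcl, A);

  // upper Hessenberg: all entries below the first subdiagonal must vanish
  for (std::size_t i = 2; i < A.size1(); ++i)
    for (std::size_t j = 0; j + 1 < i; ++j)
      if (std::fabs(A(i, j)) > EPS)
        return false;
  return true;
}
// check_tridiag() would additionally require the entries above the first
// superdiagonal to vanish.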
int main(int argc, char *argv[])
{
  struct tms usage;
  FILE *finp;
  int i, j, ticks;
  int numinfirst;
  char chkfile[255];

  i = 0;
  dump_file = NULL;
  do_cluster = do_pairwise_cluster;
  srandom(563573);
  bzero(&prog_opts, sizeof(ProgOptionsType));
  outf = stdout;

  // set default distance function
  dist = d2;
  distpair = d2pair;

#ifdef MPI
  MPI_Init(&argc, &argv);
  MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
#endif

  if (myid == 0) { // Master
    process_options(argc, argv);
  } else {
    process_slave_options(argc, argv);
  }

  if (prog_opts.show_version || (argc == 1)) {
    if (myid == 0)
      printf("Version\n%s\n", version);
#ifdef MPI
    MPI_Finalize();
#endif
    exit(0);
  }

  // Allocate space for the RC table for big words
  rc_big = calloc(BIG_WORD_TSIZE, sizeof(SeqElt));

  // work is an array of work blocks. If non-parallel, there'll only
  // be one. work[0] acts as a template.
  work = (WorkPtr) calloc(num_threads, sizeof(WorkBlock));
  work->filename = argv[optind];
  work->index = NULL;

  if (prog_opts.do_dump)
    dump_file = fopen(prog_opts.dname, "w");

#ifdef MPI
  if (numprocs > 1)
    if (myid > 0) { // slaves
      if (prog_opts.split) {
        MPI_Finalize();
        return 0;
      }
      handleMPISlaveSetup(&num_seqs);
      initialise(work, prog_opts.edfile);
      internalTest();
      perform_clustering(work);
      transmitMPISlaveResponse(work);
      if (prog_opts.show_perf)
        show_performance(outf);
      MPI_Finalize();
      exit(0);
    }
#else
  if (numprocs > 1) {
    printf("This version of wcd is not compiled with MPI\n");
    printf("You cannot run it with multiple processes\n");
    printf("Either run it with one process only, or do a\n");
    printf("   ./configure --enable-mpi\n");
    printf("   make clean\n");
    printf("   make\n");
    exit(5);
  }
#endif

  // Work out the number of sequences: if the user has specified a value
  // for num_seqs then use that, else use the number of sequences in the file.
  num_seqs = count_seqs(argv[optind], &data_size) + reindex_value;
  seq     = (SeqPtr)       calloc(num_seqs, sizeof(SeqPtr));
  seqInfo = (SeqInfoPtr)   calloc(num_seqs, sizeof(SeqInfoStruct));
  tree    = (UnionFindPtr) calloc(num_seqs, sizeof(UnionFindStruct));
  data    = (SeqPtr)       calloc(data_size, sizeof(SeqElt));
  init_dummy_sequences();
#ifndef AUXINFO
  seqID = (SeqIDPtr) calloc(num_seqs, sizeof(SeqIDStruct));
#endif
  if (seq == NULL) {
    perror("SeqStruct allocation");
    exit(50);
  }

  numinfirst = global_i_end = num_seqs;
  global_j_beg = 0;

  // if merging, need to check the other file too
  if (prog_opts.domerge || prog_opts.doadd) {
    global_j_beg = global_i_end;
    num_seqs = handleMerge(argv[optind+2], num_seqs);
    if (prog_opts.doadd)
      global_i_end = num_seqs;
  }

  initialise(work, prog_opts.edfile);
  if (data == NULL) {
    sprintf(chkfile, "Main data store (%d bytes)", data_size);
    perror(chkfile);
    exit(51);
  }
  for (i = 0; i < num_seqs; i++)
    seqInfo[i].flag = 0;

  // reopen sequence file for reading
  finp = fopen(argv[optind], "r");
  if (finp == NULL) {
    perror(argv[optind]);
    exit(51);
  }

  // Some messy stuff to handle auxiliary options
  // Skip to next comment on first reading
  if (prog_opts.pairwise == 1) {
    sscanf(argv[optind+1], "%d", &i);
    sscanf(argv[optind+2], "%d", &j);
    show_pairwise(finp, i, j);
    return 0;
  }
  if (prog_opts.statgen) {
    compared2nummatches(finp, prog_opts.statgen);
    return 0;
  }
  if (prog_opts.range) {
    sscanf(argv[optind+1], "%d", &global_i_beg);
    sscanf(argv[optind+2], "%d", &global_i_end);
  }
  if (prog_opts.show_comp == 41) {
    char *fname;
    fname = malloc(255);
    sscanf(argv[optind+1], "%s", fname);
    read_sequences(finp, reindex_value, num_seqs);
    checkfile = fopen(fname, "r");
    sscanf(argv[optind+2], "%d", &j);
    while (fscanf(checkfile, "%d", &i) != -1) {
      do_compare(finp, i, j, 1);
    }
    return 0;
  }
  if (prog_opts.show_comp) {
    sscanf(argv[optind+1], "%d", &i);
    sscanf(argv[optind+2], "%d", &j);
    //printf("Comparing %d and %d of %d flag %d\n", i, j, num_seqs, prog_opts.flag);
    read_sequences(finp, reindex_value, num_seqs);
    do_compare(finp, i, j, prog_opts.flag);
    return 0;
  }
  if (prog_opts.show_index) {
    show_sequence(finp, prog_opts.index, prog_opts.flag);
    return 0;
  }

  // Now read in the sequences
  if (do_cluster == do_pairwise_cluster || do_cluster == do_MPImaster_cluster || do_cluster == do_suffix_cluster)
    read_sequences(finp, reindex_value, numinfirst);
  else
    init_sequences(finp, reindex_value, numinfirst);
  fclose(finp);
  //printf("%d Allocated %d, start=%d, last=%d\n", num_seqs, data_size, data, seq[num_seqs-1].seq);

  if (prog_opts.split) {
    process_split(prog_opts.clfname1, prog_opts.split);
#ifdef MPI
    MPI_Finalize();
#endif
    return 0;
  }

  if (prog_opts.consfname1)
    process_constraints(prog_opts.consfname1, 0);

  if (prog_opts.clustercomp) {
    cluster_compare(argv[optind+1]);
    return 0;
  }

  // If merging or adding, need to open the second sequence file
  if (prog_opts.domerge || prog_opts.doadd) {
    finp = fopen(argv[optind+2], "r");
    if (finp == NULL) {
      perror(argv[optind]);
      exit(1);
    }
    if (do_cluster == do_pairwise_cluster)
      read_sequences(finp, numinfirst + reindex_value, num_seqs);
    else
      init_sequences(finp, numinfirst + reindex_value, num_seqs);
    get_clustering(argv[optind+1], 0);
    if (prog_opts.domerge)
      get_clustering(argv[optind+3], numinfirst);
  }
  if (prog_opts.init_cluster)
    get_clustering(prog_opts.clfname1, 0);

  if (prog_opts.recluster)
    reclustering(work, prog_opts.clfname2);
  else {
    // This really assumes there is only one thread for suffix
    if (prog_opts.pairwise == 2) {
      matrix_compare(finp);
      return 0;
    }
    work->workflag = prog_opts.noninterleavednlc; // kludge for suffix array
    global_j_end = num_seqs;
    perform_clustering(work);
#ifdef MPI
    if (myid > 0)
      transmitMPISlaveResponse(work);
#endif
  }

  if (prog_opts.show_ext)
    show_EXT(outf);
  if (prog_opts.show_histo)
    show_histogram(work);
  if (prog_opts.show_clust & 1)
    show_clusters(outf);
  if (prog_opts.show_clust & 8)
    produce_clusters(prog_opts.clthresh, prog_opts.dirname);
  if (prog_opts.show_perf)
    show_performance(outf);

  if (prog_opts.do_dump) {
    strcpy(chkfile, prog_opts.dname);
    strcat(chkfile, "-FIN");
    fclose(dump_file);
    dump_file = fopen(chkfile, "w");
    times(&usage);
    ticks = sysconf(_SC_CLK_TCK);
    fprintf(dump_file, "Completed %ld %ld", usage.tms_utime / ticks, usage.tms_stime * 1000 / ticks);
    fclose(dump_file);
  }
  if (prog_opts.show_version)
    fprintf(outf, "\n%s\n", version);
  fclose(outf);
#ifdef MPI
  MPI_Finalize();
#endif
  exit(0);
}
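/* The MPI control flow above is the usual master/slave split: rank 0 parses
 * the full options and drives the clustering, ranks > 0 take the slave path
 * and send their results back. A minimal standalone sketch of that pattern,
 * using the same MPI calls as the code above (includes shown for
 * completeness; mpi_skeleton is a hypothetical name): */
#include <mpi.h>
#include <stdio.h>

int mpi_skeleton(int argc, char *argv[])
{
  int numprocs = 1, myid = 0;
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);
  if (myid == 0)
    printf("master: coordinating %d processes\n", numprocs);  /* parse options, distribute work */
  else
    printf("slave %d: waiting for work\n", myid);              /* cluster, then transmit response */
  MPI_Finalize();
  return 0;
}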
/* Try various ways to do matmul and time them: a tiled algorithm running
 * serially; the multi-threaded QUARK runtime with the tiled algorithm; and
 * direct serial computation over the standard layout. */
int main_algorithm(int NB, int N, int THREADS)
{
  int i, j, k, nerr = 0;
  int BB = N / NB;
  double *A           = (double*)malloc(N*N*sizeof(double));
  double *Ablk        = (double*)malloc(N*N*sizeof(double));
  double *B           = (double*)malloc(N*N*sizeof(double));
  double *Bblk        = (double*)malloc(N*N*sizeof(double));
  double *C_direct    = (double*)malloc(N*N*sizeof(double));
  double *C           = (double*)malloc(N*N*sizeof(double));
  double *Cblk        = (double*)malloc(N*N*sizeof(double));
  double *C_quark     = (double*)malloc(N*N*sizeof(double));
  double *C_quark_blk = (double*)malloc(N*N*sizeof(double));
  struct timeval tstart, tend, tdiff;
  double t_blk = 0, t_quark = 0, t_direct = 0;

  // Initialize
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++) {
      A[i+j*N] = (double)1.0 + i;
      B[i+j*N] = (double)2.0 + i + j;
      C_quark[i+j*N] = C_direct[i+j*N] = C[i+j*N] = 3.0;
    }
  }
  matrix_print("Printing A", A, N);
  matrix_print("Printing B", B, N);
  matrix_print("Printing C before computation", C, N);

  // Move from F77 (column-major) layout to block data layout (BDL)
  std_to_bdl( A, Ablk, N, NB );
  std_to_bdl( B, Bblk, N, NB );
  std_to_bdl( C, Cblk, N, NB );
  std_to_bdl( C_quark, C_quark_blk, N, NB );

  /* ORIGINAL TILED ROUTINE */
  /* This is the code for the serial tile-by-tile multiplication */
  printf("Doing matrix multiplication using serial tile-by-tile algorithm\n");
  gettimeofday( &tstart, NULL );
  for (i = 0; i < BB; i++)
    for (j = 0; j < BB; j++)
      for (k = 0; k < BB; k++)
        // tile (i,k) starts at offset NB*NB*(i + BB*k): tiles are stored
        // contiguously, column-major over tiles
        matmul( &Ablk[NB*NB*i + NB*NB*BB*k],
                &Bblk[NB*NB*k + NB*NB*BB*j],
                &Cblk[NB*NB*i + NB*NB*BB*j], NB );
  gettimeofday( &tend, NULL );
  t_blk = timeval_subtract( &tdiff, &tend, &tstart );
  printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
  bdl_to_std( C, Cblk, N, NB );
  matrix_print("Printing C produced by serial tile-algorithm after computation", C, N);
  printf("\n");

  /* QUARK PARALLEL TILED ROUTINE */
  /* This is the code for the QUARK runtime to do the parallel multi-threaded tile-by-tile algorithm */
  printf("Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n");
  Quark *quark = QUARK_New(THREADS);
  gettimeofday( &tstart, NULL );
  for (i = 0; i < BB; i++)
    for (j = 0; j < BB; j++)
      for (k = 0; k < BB; k++)
        matmul_quark_call( quark,
                           &Ablk[NB*NB*i + NB*NB*BB*k],
                           &Bblk[NB*NB*k + NB*NB*BB*j],
                           &C_quark_blk[NB*NB*i + NB*NB*BB*j], NB );
  QUARK_Barrier( quark );
  gettimeofday( &tend, NULL );
  t_quark = timeval_subtract( &tdiff, &tend, &tstart );
  printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
  QUARK_Delete(quark);
  bdl_to_std( C_quark, C_quark_blk, N, NB );
  matrix_print("Printing C produced by QUARK runtime after computation", C_quark, N);
  printf("\n");

  /* DIRECT COMPUTATION OVER STANDARD LAYOUT */
  /* Compute direct C if desired */
  printf("Doing matrix multiplication using direct loops (i.e., view matrix as one big tile)\n");
  gettimeofday( &tstart, NULL );
  matmul( A, B, C_direct, N );
  gettimeofday( &tend, NULL );
  t_direct = timeval_subtract( &tdiff, &tend, &tstart );
  printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
  matrix_print("Printing C produced by direct matmul after computation", C_direct, N);
  printf("\n");

  /* Check for errors */
  printf("Comparing result matrices (direct versus QUARK)\n");
  nerr = matrix_compare( C_direct, C_quark, N );
  printf("Number of differences: %d\n", nerr);
  printf("\n");

  printf("Summary of time taken\n");
  printf("Direct       SerialBlock  QUARK(%d threads)\n", THREADS);
  printf("%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark);

  free(A); free(Ablk);
  free(B); free(Bblk);
  free(C); free(Cblk);
  free(C_direct);
  free(C_quark); free(C_quark_blk);
  return 0;
}
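/* timeval_subtract() is not shown in this excerpt; from its use above it
 * must store tend - tstart in *tdiff and return the elapsed time in seconds
 * as a double (that return value fills t_blk, t_quark, and t_direct).
 * A minimal sketch consistent with that usage: */
#include <sys/time.h>

double timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y)
{
  result->tv_sec  = x->tv_sec  - y->tv_sec;
  result->tv_usec = x->tv_usec - y->tv_usec;
  if (result->tv_usec < 0) {  /* borrow a second if microseconds went negative */
    result->tv_sec  -= 1;
    result->tv_usec += 1000000;
  }
  return result->tv_sec + (double)result->tv_usec / 1000000.0;
}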