VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo) { viennacl::backend::mem_handle v1_handle; viennacl::backend::mem_handle A_handle; if (init_vector(v1_handle, x) != ViennaCLSuccess) return ViennaCLGenericFailure; if (init_matrix(A_handle, A) != ViennaCLSuccess) return ViennaCLGenericFailure; switch (x->precision) { case ViennaCLFloat: { viennacl::vector_base<float> v1(v1_handle, x->size, x->offset, x->inc); viennacl::matrix_base<float> mat(A_handle, A->size1, A->start1, A->stride1, A->internal_size1, A->size2, A->start2, A->stride2, A->internal_size2, A->order == ViennaCLRowMajor); if (A->trans == ViennaCLTrans) { if (uplo == ViennaCLUpper) viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag()); else viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag()); } else { if (uplo == ViennaCLUpper) viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag()); else viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag()); } return ViennaCLSuccess; } case ViennaCLDouble: { viennacl::vector_base<double> v1(v1_handle, x->size, x->offset, x->inc); viennacl::matrix_base<double> mat(A_handle, A->size1, A->start1, A->stride1, A->internal_size1, A->size2, A->start2, A->stride2, A->internal_size2, A->order == ViennaCLRowMajor); if (A->trans == ViennaCLTrans) { if (uplo == ViennaCLUpper) viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::upper_tag()); else viennacl::linalg::inplace_solve(viennacl::trans(mat), v1, viennacl::linalg::lower_tag()); } else { if (uplo == ViennaCLUpper) viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::upper_tag()); else viennacl::linalg::inplace_solve(mat, v1, viennacl::linalg::lower_tag()); } return ViennaCLSuccess; } default: return ViennaCLGenericFailure; } }
/* assuming slaves (workers)) are all homogenous, let them all do the calculations regarding primes sieving, calculating the smoothness base and the modular roots */ int main(int argc, char **argv) { MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_group_size); int len; MPI_Get_processor_name(processor_name, &len); gettimeofday(&start_global, NULL); print_lib_version(); mpz_init(N); mpz_t B; mpz_init(B); unsigned long int uBase; int64_t nb_primes; modular_root_t *modular_roots; uint64_t i, j; if (argc < 2) { PRINT(my_rank, "usage: %s Number_to_factorize\n", argv[0]); exit(2); } if (mpz_init_set_str(N, argv[1], 10) == -1) { PRINT(my_rank, "Cannot load N %s\n", argv[1]); exit(2); } mpz_t sqrtN, rem; mpz_init(sqrtN); mpz_init(rem); mpz_sqrtrem(sqrtN, rem, N); if (mpz_cmp_ui(rem, 0) != 0) /* if not perfect square, calculate the ceiling */ mpz_add_ui(sqrtN, sqrtN, 1); else /* N is a perfect square, factored! */ { PRINT(my_rank, "\n<<<[FACTOR]>>> %s\n", mpz_get_str(NULL, 10, sqrtN)); return 0; } if (mpz_probab_prime_p(N, 10) > 0) /* don't bother factoring */ { PRINT(my_rank, "N:%s is prime\n", mpz_get_str(NULL, 10, N)); exit(0); } OPEN_LOG_FILE("freq"); //-------------------------------------------------------- // calculate the smoothness base for the given N //-------------------------------------------------------- get_smoothness_base(B, N); /* if N is too small, the program will surely fail, please consider a pen and paper instead */ uBase = mpz_get_ui(B); PRINT(my_rank, "n: %s\tBase: %s\n", mpz_get_str(NULL, 10, N), mpz_get_str(NULL, 10, B)); //-------------------------------------------------------- // sieve primes that are less than the smoothness base using Eratosthenes sieve //-------------------------------------------------------- START_TIMER(); nb_primes = sieve_primes_up_to((int64_t) (uBase)); PRINT(my_rank, "\tPrimes found %" PRId64 " [Smoothness Base %lu]\n", nb_primes, uBase); 
STOP_TIMER_PRINT_TIME("\tEratosthenes Sieving done"); //-------------------------------------------------------- // fill the primes array with primes to which n is a quadratic residue //-------------------------------------------------------- START_TIMER(); primes = calloc(nb_primes, sizeof(int64_t)); nb_qr_primes = fill_primes_with_quadratic_residue(primes, N); /*for(i=0; i<nb_qr_primes; i++) PRINT(my_rank, "%" PRId64 "\n", primes[i]);*/ PRINT(my_rank, "\tN-Quadratic primes found %" PRId64 "\n", nb_qr_primes); STOP_TIMER_PRINT_TIME("\tQuadratic prime filtering done"); //-------------------------------------------------------- // calculate modular roots //-------------------------------------------------------- START_TIMER(); modular_roots = calloc(nb_qr_primes, sizeof(modular_root_t)); mpz_t tmp, r1, r2; mpz_init(tmp); mpz_init(r1); mpz_init(r2); for (i = 0; i < nb_qr_primes; i++) { mpz_set_ui(tmp, (unsigned long) primes[i]); mpz_sqrtm(r1, N, tmp); /* calculate the modular root */ mpz_neg(r2, r1); /* -q mod n */ mpz_mod(r2, r2, tmp); modular_roots[i].root1 = mpz_get_ui(r1); modular_roots[i].root2 = mpz_get_ui(r2); } mpz_clear(tmp); mpz_clear(r1); mpz_clear(r2); STOP_TIMER_PRINT_TIME("Modular roots calculation done"); //-------------------------------------------------------- // ***** initialize the matrix ***** //-------------------------------------------------------- if (my_rank == 0) /* only the master have the matrix */ { START_TIMER(); init_matrix(&matrix, nb_qr_primes + NB_VECTORS_OFFSET, nb_qr_primes); mpz_init2(tmp_matrix_row, nb_qr_primes); STOP_TIMER_PRINT_TIME("Matrix initialized"); } //-------------------------------------------------------- // [Sieving] - everyones sieves including the master //-------------------------------------------------------- START_TIMER(); mpz_t x, sieving_index, next_sieving_index, relative_start, global_step; unsigned long ui_index, SIEVING_STEP = 50000; /* we sieve for 50000 elements at each loop */ int 
LOCAL_SIEVING_ROUNDS = 10; /* number of iterations a worker sieves before communicating results to the master */ unsigned long sieving_round = 0; unsigned long nb_big_rounds = 0; uint64_t p_pow; smooth_number_t *x_squared; x_squared = calloc(SIEVING_STEP, sizeof(smooth_number_t)); if (my_rank == 0) smooth_numbers = calloc(nb_qr_primes + NB_VECTORS_OFFSET, sizeof(smooth_number_t)); else temp_slaves_smooth_numbers = calloc(500, sizeof(smooth_number_t)); /* TODO: this is not properly correct, using a linkedlist is better to keep track of temporary * smooth numbers at the slaves nodes however it's pretty rare to find 500 smooth numbers in * 50000 * 10 interval. */ mpz_init_set(x, sqrtN); mpz_init(global_step); mpz_init(relative_start); mpz_init(sieving_index); mpz_init(next_sieving_index); mpz_t p; mpz_init(p); mpz_t str; mpz_init_set(str, sieving_index); PRINT(my_rank, "\n[%s] Sieving ...\n", processor_name); //-------------------------------------------------------- // Init before sieving //-------------------------------------------------------- for (i = 0; i < SIEVING_STEP; i++) { mpz_init(x_squared[i].value_x); mpz_init(x_squared[i].value_x_squared); mpz_init2(x_squared[i].factors_vect, nb_qr_primes); mpz_add_ui(x, x, 1); } int nb_smooth_per_round = 0; char s[512]; //-------------------------------------------------------- // WHILE smooth numbers found less than the primes in the smooth base + NB_VECTORS_OFFSET for master // Or master asked for more smooth numbers from slaves //-------------------------------------------------------- while (1) { mpz_set_ui(global_step, nb_big_rounds); /* calculates the coordinate where the workers start sieving from */ mpz_mul_ui(global_step, global_step, (unsigned long) mpi_group_size); mpz_mul_ui(global_step, global_step, SIEVING_STEP); mpz_mul_ui(global_step, global_step, LOCAL_SIEVING_ROUNDS); mpz_add(global_step, global_step, sqrtN); mpz_set_ui(relative_start, SIEVING_STEP); mpz_mul_ui(relative_start, relative_start, 
LOCAL_SIEVING_ROUNDS); mpz_mul_ui(relative_start, relative_start, (unsigned long) my_rank); mpz_add(relative_start, relative_start, global_step); mpz_set(sieving_index, relative_start); mpz_set(next_sieving_index, relative_start); for (sieving_round = 0; sieving_round < LOCAL_SIEVING_ROUNDS; /* each slave sieves for LOCAL_SIEVING_ROUNDS rounds */ sieving_round++) { nb_smooth_per_round = 0; mpz_set(x, next_sieving_index); /* sieve numbers from sieving_index to sieving_index + sieving_step */ mpz_set(sieving_index, next_sieving_index); if (my_rank == 0) { printf("\r"); printf( "\t\tSieving at: %s30 <--> Smooth numbers found: %" PRId64 "/%" PRId64 "", mpz_get_str(NULL, 10, sieving_index), nb_global_smooth_numbers_found, nb_qr_primes); fflush(stdout); } for (i = 0; i < SIEVING_STEP; i++) { mpz_set(x_squared[i].value_x, x); mpz_pow_ui(x_squared[i].value_x_squared, x, 2); /* calculate value_x_squared <- x²-n */ mpz_sub(x_squared[i].value_x_squared, x_squared[i].value_x_squared, N); mpz_clear(x_squared[i].factors_vect); mpz_init2(x_squared[i].factors_vect, nb_qr_primes); /* reconstruct a new fresh 0ed vector of size nb_qr_primes bits */ mpz_add_ui(x, x, 1); } mpz_set(next_sieving_index, x); //-------------------------------------------------------- // eliminate factors in the x_squared array, those who are 'destructed' to 1 are smooth //-------------------------------------------------------- for (i = 0; i < nb_qr_primes; i++) { mpz_set_ui(p, (unsigned long) primes[i]); mpz_set(x, sieving_index); /* get the first multiple of p that is directly larger that sieving_index * Quadratic SIEVING: all elements from this number and in positions multiples of root1 and root2 * are also multiples of p */ get_sieving_start_index(x, x, p, modular_roots[i].root1); mpz_set(str, x); mpz_sub(x, x, sieving_index); /* x contains index of first number that is divisible by p */ for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) { p_pow = mpz_remove(x_squared[j].value_x_squared, 
x_squared[j].value_x_squared, p); /* eliminate all factors of p */ if (p_pow & 1) /* mark bit if odd power of p exists in this x_squared[j] */ { mpz_setbit(x_squared[j].factors_vect, i); } if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) { save_smooth_number(x_squared[j]); nb_smooth_per_round++; } /* sieve next element located p steps from here */ } /* same goes for root2 */ if (modular_roots[i].root2 == modular_roots[i].root1) continue; mpz_set(x, sieving_index); get_sieving_start_index(x, x, p, modular_roots[i].root2); mpz_set(str, x); mpz_sub(x, x, sieving_index); for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) { p_pow = mpz_remove(x_squared[j].value_x_squared, x_squared[j].value_x_squared, p); if (p_pow & 1) { mpz_setbit(x_squared[j].factors_vect, i); } if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) { save_smooth_number(x_squared[j]); nb_smooth_per_round++; } } } } if (my_rank == 0) /* master gathers smooth numbers from slaves */ { gather_smooth_numbers(); notify_slaves(); } else /* slaves send their smooth numbers to master */ { send_smooth_numbers_to_master(); nb_global_smooth_numbers_found = get_server_notification(); } if (nb_global_smooth_numbers_found >= nb_qr_primes + NB_VECTORS_OFFSET) break; nb_big_rounds++; } STOP_TIMER_PRINT_TIME("\nSieving DONE"); if (my_rank == 0) { uint64_t t = 0; //-------------------------------------------------------- //the matrix ready, start Gauss elimination. 
The Matrix is filled on the call of save_smooth_number() //-------------------------------------------------------- START_TIMER(); gauss_elimination(&matrix); STOP_TIMER_PRINT_TIME("\nGauss elimination done"); uint64_t row_index = nb_qr_primes + NB_VECTORS_OFFSET - 1; /* last row in the matrix */ int nb_linear_relations = 0; mpz_t linear_relation_z, solution_z; mpz_init(linear_relation_z); mpz_init(solution_z); get_matrix_row(linear_relation_z, &matrix, row_index--); /* get the last few rows in the Gauss eliminated matrix*/ while (mpz_cmp_ui(linear_relation_z, 0) == 0) { nb_linear_relations++; get_matrix_row(linear_relation_z, &matrix, row_index--); } PRINT(my_rank, "\tLinear dependent relations found : %d\n", nb_linear_relations); //-------------------------------------------------------- // Factor //-------------------------------------------------------- //We use the last linear relation to reconstruct our solution START_TIMER(); PRINT(my_rank, "%s", "\nFactorizing..\n"); mpz_t solution_X, solution_Y; mpz_init(solution_X); mpz_init(solution_Y); /* we start testing from the first linear relation encountered in the matrix */ for (j = nb_linear_relations; j > 0; j--) { PRINT(my_rank, "Trying %d..\n", nb_linear_relations - j + 1); mpz_set_ui(solution_X, 1); mpz_set_ui(solution_Y, 1); get_identity_row(solution_z, &matrix, nb_qr_primes + NB_VECTORS_OFFSET - j + 1); for (i = 0; i < nb_qr_primes; i++) { if (mpz_tstbit(solution_z, i)) { mpz_mul(solution_X, solution_X, smooth_numbers[i].value_x); mpz_mod(solution_X, solution_X, N); /* reduce x to modulo N */ mpz_mul(solution_Y, solution_Y, smooth_numbers[i].value_x_squared); /*TODO: handling huge stuff here, there is no modulo N like in the solution_X case! 
* eliminate squares as long as you go*/ } } mpz_sqrt(solution_Y, solution_Y); mpz_mod(solution_Y, solution_Y, N); /* y = sqrt(MUL(xi²-n)) mod N */ mpz_sub(solution_X, solution_X, solution_Y); mpz_gcd(solution_X, solution_X, N); if (mpz_cmp(solution_X, N) != 0 && mpz_cmp_ui(solution_X, 1) != 0) /* factor can be 1 or N, try another relation */ break; } mpz_cdiv_q(solution_Y, N, solution_X); PRINT(my_rank, "\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N)); PRINT( my_rank, "\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y)); sprintf(s, "\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N)); APPEND_TO_LOG_FILE(s); sprintf(s, "\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y)); APPEND_TO_LOG_FILE(s); gettimeofday(&end_global, NULL); timersub(&end_global, &start_global, &elapsed); sprintf(s, "****** TOTAL TIME: %.3f ms\n", elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000); APPEND_TO_LOG_FILE(s); STOP_TIMER_PRINT_TIME("\nFactorizing done"); } PRINT(my_rank, "%s", "\nCleaning memory..\n"); /********************** clear the x_squared array **********************/ for (i = 0; i < SIEVING_STEP; i++) { mpz_clear(x_squared[i].value_x); mpz_clear(x_squared[i].value_x_squared); //free(x_squared[i].factors_exp); mpz_clear(x_squared[i].factors_vect); } free(x_squared); /********************** clear the x_squared array **********************/ free(modular_roots); /********************** clear the smooth_numbers array **********************/ if (my_rank == 0) { for (i = 0; i < nb_qr_primes + NB_VECTORS_OFFSET; i++) { mpz_clear(smooth_numbers[i].value_x); mpz_clear(smooth_numbers[i].value_x_squared); mpz_clear(smooth_numbers[i].factors_vect); //free(smooth_numbers[i].factors_exp); } free(smooth_numbers); } else { for (i = 0; i < 500; i++) { mpz_clear(temp_slaves_smooth_numbers[i].value_x); mpz_clear(temp_slaves_smooth_numbers[i].value_x_squared); 
mpz_clear(temp_slaves_smooth_numbers[i].factors_vect); } free(temp_slaves_smooth_numbers); } /********************** clear the smooth_numbers array **********************/ free(primes); /********************** clear mpz _t **********************/mpz_clear(B); mpz_clear(N); sqrtN, rem; mpz_clear(x); mpz_clear(sieving_index); mpz_clear(next_sieving_index); mpz_clear(p); mpz_clear(str); /********************** clear mpz _t **********************/ free_matrix(&matrix); gettimeofday(&end_global, NULL); timersub(&end_global, &start_global, &elapsed); PRINT(my_rank, "****** TOTAL TIME: %.3f ms\n", elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000); show_mem_usage(); MPI_Finalize(); return 0; }
int main() { /*FILE *f; char fn[] = "data.txt"; f = fopen(fn, "w"); if(f == NULL) { printf("Error: FILE!\n"); exit(1); }*/ //TESTING //TEST - STANDARD COPY, COMPRESS, DECOMPRESS, MULTIPLICATION Matrix* test_mtx_1 = init_matrix(1000, 1000); f2(test_mtx_1, 1, 1, 2); Matrix* test_mtx_2 = copy_matrix(test_mtx_1); test(assert_matrix(test_mtx_1, test_mtx_2), "Copy matrix"); CRS* test_crs_1 = cp_crs(test_mtx_1); CCS* test_ccs_1 = cp_ccs(test_mtx_2); Matrix* test_ucp_mtx_1 = uncp_crs(test_crs_1); Matrix* test_ucp_mtx_2 = uncp_ccs(test_ccs_1); test(assert_matrix(test_ucp_mtx_1, test_mtx_1), "Decompress crs"); test(assert_matrix(test_ucp_mtx_2, test_mtx_2), "Decompress ccs"); test(assert_matrix(test_ucp_mtx_1, test_ucp_mtx_2), "Decompress matrix"); Vector* standard_vector = gen_vector(1000, 0.1, 1); Vector* crs_product = mtp_crs(test_crs_1, standard_vector); Vector* ccs_product = mtp_ccs(test_ccs_1, standard_vector); test(assert_vector(crs_product, ccs_product), "CCS and CRS product"); //TEST - CLEAN free_matrix(test_mtx_1); free_matrix(test_mtx_2); free_ccs(test_ccs_1); free_crs(test_crs_1); free_vector(standard_vector); free_vector(crs_product); free_vector(ccs_product); free_matrix(test_ucp_mtx_1); free_matrix(test_ucp_mtx_2); //TESTING //TEST - CRS PARALLEL PRODUCT Matrix* test_mtx = init_matrix(100, 100); f2(test_mtx, 1, 1, 2); CRS* standard_test_crs = cp_crs(test_mtx); Vector* vector = gen_vector(100, 0.1, 1); free_matrix(test_mtx); Vector* standard_product = mtp_crs(standard_test_crs, vector); Vector* openmp_product = openmp_mtp_crs(standard_test_crs, vector); test(assert_vector(standard_product, openmp_product), "CRS openmp product validation"); free_vector(openmp_product); Vector* pthread_product = pthread_mtp_crs(standard_test_crs, vector); test(assert_vector(standard_product, pthread_product), "CRS pthread product validation"); free_vector(pthread_product); //TEST - CLEAN //free_matrix(test_mtx); free_crs(standard_test_crs); free_vector(vector); 
free_vector(standard_product); /* //_________________________________________________________________________________ //TEST - SPEED Matrix* mtx_speed = init_matrix(10,10); f2(mtx_speed,1,1,2); CRS* crs_speed = cp_crs(mtx_speed); CCS* ccs_speed = cp_ccs(mtx_speed); Vector* vector_speed = gen_vector(10, 0.1, 1); printf("\nStandard ccs product\n"); init_stoper(); Vector* mtp_ccs_product = mtp_ccs(ccs_speed, vector_speed); print_stoper(); printf("\nStandard crs product\n"); init_stoper(); Vector* mtp_crs_product = mtp_crs(crs_speed, vector_speed); print_stoper(); //TEST - PRODUCT VALIDATION test(assert_vector(mtp_ccs_product, mtp_crs_product), "ccs and crs product"); //TEST - CLEAN free_crs(crs_speed); free_ccs(ccs_speed); //TEST - OPENMP PRODUCT CRS* openmp_crs_speed = cp_crs(mtx_speed); printf("\nopenmp crs product\n"); init_stoper(); Vector* openmp_mtp_crs_product = openmp_mtp_crs(openmp_crs_speed, vector_speed); print_stoper(); //TEST - OPENMP PRODUCT VALIDATION test(assert_vector(openmp_mtp_crs_product, mtp_crs_product), "openmp product validation"); //TEST - OPENMP CLEAN free_crs(openmp_crs_speed); free_vector(openmp_mtp_crs_product); //TEST - PTHREAD PRODUCT CRS* pthread_crs_speed = cp_crs(mtx_speed); printf("\npthread crs product\n"); init_stoper(); Vector* pthread_mtp_crs_product = pthread_mtp_crs(pthread_crs_speed, vector_speed); print_stoper(); //TEST - PTHREAD PRODUCT VALIDATION test(assert_vector(pthread_mtp_crs_product, mtp_crs_product), "pthread product validation"); //TEST PTHREAD CLEAN free_crs(pthread_crs_speed); free_vector(pthread_mtp_crs_product); //TEST - MPI PRODUCT CRS* mpi_crs_speed = copy_crs(crs_speed); printf("\nmpi crs product\n"); init_stoper(); Vector* mpi_mtp_crs_product = mpi_mtp_crs(mpi_crs_speed, vector_speed); print_stoper(); //TEST - MPI PRODUCT VALIDATION test(assert_vector(mpi_mtp_crs_product, mtp_crs_product), "mpi product validation"); //TEST - MPI CLEAN free_crs(mpi_crs_speed); free_vector(mpi_mtp_crs_product); */ return 0; }
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sswap, sswapblk, slaswp, slaswpx

   For every problem size in opts.nsize (repeated opts.niter times) this driver
   applies nb row/column interchanges to N x N single-precision matrices with
   several device routines, times each one, replays the same interchanges on the
   host with blasf77_sswap / lapackf77_slaswp and compares device and host
   results with diff_matrix(). One table line is printed per run; a '*' next to
   a number marks a routine whose result differed from the host reference.

   Returns the number of runs in which at least one comparison failed.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    float *h_A1, *h_A2;        // host matrices the reference swaps are applied to
    float *h_R1, *h_R2;        // host copies of the device results
    magmaFloat_ptr d_A1, d_A2; // device matrices

    // row-major and column-major performance
    real_Double_t row_perf0 = MAGMA_D_NAN, col_perf0 = MAGMA_D_NAN;
    real_Double_t row_perf1 = MAGMA_D_NAN, col_perf1 = MAGMA_D_NAN;
    real_Double_t row_perf2 = MAGMA_D_NAN, col_perf2 = MAGMA_D_NAN;
    real_Double_t row_perf4 = MAGMA_D_NAN;
    real_Double_t row_perf5 = MAGMA_D_NAN, col_perf5 = MAGMA_D_NAN;
    real_Double_t row_perf6 = MAGMA_D_NAN, col_perf6 = MAGMA_D_NAN;
    real_Double_t row_perf7 = MAGMA_D_NAN;
    real_Double_t cpu_perf  = MAGMA_D_NAN;

    real_Double_t time, gbytes;

    magma_int_t N, lda, ldda, nb, j;
    magma_int_t ione = 1;
    magma_int_t *ipiv, *ipiv2; // NOTE(review): ipiv2 is allocated and freed but never otherwise used here
    magmaInt_ptr d_ipiv;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_queue_t queue = 0;

    // table header
    printf(" %8s sswap sswap sswapblk slaswp slaswp2 slaswpx scopymatrix CPU (all in )\n", g_platform_str );
    printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj/col-maj row-blk/col-blk slaswp (GByte/s)\n");
    printf("=========================================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // For an N x N matrix, swap nb rows or nb columns using various methods.
            // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure.
            // The variable 'shift' keeps track of which bit is for current test
            int shift = 1;
            int check = 0;
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N+31)/32)*32;  // device leading dimension: N rounded up to a multiple of 32
            nb     = (opts.nb > 0 ? opts.nb : magma_get_sgetrf_nb( N ));
            nb     = min( N, nb );
            // each swap does 2N loads and 2N stores, for nb swaps
            gbytes = sizeof(float) * 4.*N*nb / 1e9;

            TESTING_MALLOC_PIN( h_A1, float, lda*N );
            TESTING_MALLOC_PIN( h_A2, float, lda*N );
            TESTING_MALLOC_PIN( h_R1, float, lda*N );
            TESTING_MALLOC_PIN( h_R2, float, lda*N );

            TESTING_MALLOC_CPU( ipiv,  magma_int_t, nb );
            TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb );

            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb );
            TESTING_MALLOC_DEV( d_A1, float, ldda*N );
            TESTING_MALLOC_DEV( d_A2, float, ldda*N );

            // getrf always makes ipiv[j] >= j+1, where ipiv is one based and j is zero based
            // some implementations (e.g., MacOS dlaswp) assume this
            for( j=0; j < nb; j++ ) {
                ipiv[j] = (rand() % (N-j)) + j + 1;
                assert( ipiv[j] >= j+1 );
                assert( ipiv[j] <= N   );
            }

            /* =====================================================================
             * cublas / clBLAS / Xeon Phi sswap, row-by-row (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    #ifdef HAVE_CUBLAS
                    cublasSswap( opts.handle, N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1 );
                    #else
                    magma_sswap( N, d_A1, ldda*j, 1, d_A2, ldda*(ipiv[j]-1), 1, opts.queue );
                    #endif
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf0 = gbytes / time;

            // same interchanges on the host, then compare with what the device produced
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    #ifdef HAVE_CUBLAS
                    cublasSswap( opts.handle, N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                    #else
                    magma_sswap( N, d_A1, j, ldda, d_A2, ipiv[j]-1, ldda, opts.queue );
                    #endif
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf0 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * sswap, row-by-row (2 matrices)
             */

            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_sswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf1 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_sswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf1 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * sswapblk, blocked version (2 matrices)
             */

            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            magmablas_sswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            row_perf2 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );

            time = magma_sync_wtime( queue );
            magmablas_sswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            col_perf2 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * LAPACK-style slaswp (1 matrix)
             */

            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_slaswp( N, d_A1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf4 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * LAPACK-style slaswp (1 matrix) - d_ipiv on GPU
             */

            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );

            // the pivot upload sits inside the timed region
            time = magma_sync_wtime( queue );
            magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 );
            magmablas_slaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 );
            time = magma_sync_wtime( queue ) - time;
            row_perf7 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * LAPACK-style slaswpx (extended for row- and col-major) (1 matrix)
             */

            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_slaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf5 = gbytes / time;

            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* Col Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );

            time = magma_sync_wtime( queue );
            magmablas_slaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            col_perf5 = gbytes / time;
            #endif

            /* LAPACK swap on CPU for comparison */
            // With HAVE_CUBLAS this host result is also the reference for the
            // col-major slaswpx run above (compared in the #ifdef block below).
            time = magma_wtime();
            lapackf77_slaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione);
            time = magma_wtime() - time;
            cpu_perf = gbytes / time;

            #ifdef HAVE_CUBLAS
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * Copy matrix.
             */

            time = magma_sync_wtime( queue );
            magma_scopymatrix( N, nb, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            col_perf6 = 0.5 * gbytes / time;

            time = magma_sync_wtime( queue );
            magma_scopymatrix( nb, N, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            row_perf6 = 0.5 * gbytes / time;

            // One result line; each mask below is the 'check' bit assigned to that
            // test by the running 'shift' value, in the order the tests ran above.
            printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n",
                   (int) N, (int) nb,
                   row_perf0, ((check & 0x001) != 0 ? '*' : ' '),
                   col_perf0, ((check & 0x002) != 0 ? '*' : ' '),
                   row_perf1, ((check & 0x004) != 0 ? '*' : ' '),
                   col_perf1, ((check & 0x008) != 0 ? '*' : ' '),
                   row_perf2, ((check & 0x010) != 0 ? '*' : ' '),
                   col_perf2, ((check & 0x020) != 0 ? '*' : ' '),
                   row_perf4, ((check & 0x040) != 0 ? '*' : ' '),
                   row_perf7, ((check & 0x080) != 0 ? '*' : ' '),
                   row_perf5, ((check & 0x100) != 0 ? '*' : ' '),
                   col_perf5, ((check & 0x200) != 0 ? '*' : ' '),
                   row_perf6,
                   col_perf6,
                   cpu_perf,
                   (check == 0 ? "ok" : "* failed") );
            status += ! (check == 0);

            TESTING_FREE_PIN( h_A1 );
            TESTING_FREE_PIN( h_A2 );
            TESTING_FREE_PIN( h_R1 );
            TESTING_FREE_PIN( h_R2 );

            TESTING_FREE_CPU( ipiv  );
            TESTING_FREE_CPU( ipiv2 );

            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_A1 );
            TESTING_FREE_DEV( d_A2 );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
/*
 * Parse one command line and run the matching matrix operation.
 *
 * cmd is modified in place: the first space, if any, is overwritten with '\0'
 * so that the command name and its argument text become separate strings.
 * An empty string does nothing. Commands that take arguments print their usage
 * line to work_wnd when the arguments are missing or cannot be parsed. The "q"
 * command calls finish() and exits the process. Anything unrecognised ends in
 * print_help().
 */
void eval(char *cmd)
{
    char *space;
    const char *args;

    if (cmd[0] == '\0')
        return;

    /* split "name arguments" at the first space */
    space = strchr(cmd, ' ');
    if (space != NULL)
        *space = '\0';
    args = (space != NULL) ? space + 1 : NULL;

    if (strcmp(cmd, "init") == 0) {
        int rows, cols;
        if (args != NULL && sscanf(args, "%d %d", &rows, &cols) == 2)
            init_matrix(rows, cols);
        else
            wprintw(work_wnd, "Usage: init nrows ncolumns\n");
    } else if (strcmp(cmd, "randomize") == 0) {
        int low, high;
        if (args != NULL && sscanf(args, "%d %d", &low, &high) == 2)
            randomize(low, high);
        else
            wprintw(work_wnd, "Usage: randomize min max\n");
    } else if (strcmp(cmd, "mutate") == 0) {
        int row, col, value;
        if (args != NULL && sscanf(args, "%d %d %d", &row, &col, &value) == 3)
            mutate(row, col, value);
        else
            wprintw(work_wnd, "Usage: mutate row column new_value\n");
    } else if (strcmp(cmd, "null") == 0) {
        tonull();
    } else if (strcmp(cmd, "sumDown") == 0) {
        sumDown();
    } else if (strcmp(cmd, "reflectSide") == 0) {
        transposeSide();
    } else if (strcmp(cmd, "rotate") == 0) {
        rotate();
    } else if (strcmp(cmd, "flipH") == 0) {
        flipH();
    } else if (strcmp(cmd, "avg") == 0) {
        avg();
    } else if (strcmp(cmd, "feswap") == 0) {
        feswap();
    } else if (strcmp(cmd, "leswap") == 0) {
        leswap();
    } else if (strcmp(cmd, "ecswap") == 0) {
        ecswap();
    } else if (strcmp(cmd, "q") == 0) {
        finish();
        exit(EXIT_SUCCESS);
    } else if (strcmp(cmd, "sumC") == 0) {
        size_t col;
        if (args != NULL && sscanf(args, "%zu", &col) == 1)
            sumColumn(col);
        else
            wprintw(work_wnd, "Usage: sumC column\n");
    } else {
        print_help();
    }
}
int main(int argc, char **argv) { pthread_t thr[COLS1]; pthread_t sumThread; init_matrix(); // Barrier initialization if(pthread_barrier_init(&barr, NULL, COLS1+1)){ printf("Could not create a barrier\n"); return -1; } int i; for(i = 0; i < COLS1; ++i){ if(pthread_create(&thr[i], NULL, &multiplica, (void*)i)){ printf("Could not create thread %d\n", i); return -1; } } if(pthread_create(&sumThread, NULL, &soma, NULL)){ printf("Could not create thread %d\n", i); return -1; } for(i = 0; i < COLS1; ++i){ if(pthread_join(thr[i], NULL)){ printf("Could not join thread %d\n", i); return -1; } } if(pthread_join(sumThread, NULL)){ printf("Could not join thread %d\n", i); return -1; } int r,c; printf("Matriz 1:\n"); for(r=0;r<ROWS1;r++){ for(c=0;c<COLS1;c++){ printf("%d ",matrix1[r][c]); } printf("\n"); } printf("Matriz 2:\n"); for(r=0;r<ROWS2;r++){ for(c=0;c<COLS2;c++){ printf("%d ",matrix2[r][c]); } printf("\n"); } printf("Matriz Final:\n"); for(r=0;r<ROWS1;r++){ for(c=0;c<COLS2;c++){ printf("%d ",matrix_final[r][c]); } printf("\n"); } printf("TERMINOU!\n"); return 0; }
int main(int argc, char** argv) { int rank, size; int N; char opt; int nt = -1; int max_threads = 16; // on jupiter bool id = false; algo_t algo = reduce_scatter; FILE *f = NULL; static const char optstring[] = "n:a:f:i:p:"; static const struct option long_options[] = { {"n", 1, NULL, 'n'}, {"file", 1, NULL, 'f'}, {"i", 1, NULL, 'i'}, {NULL, 0, NULL, 0} }; MPI_Init(&argc,&argv); // get rank and size from communicator MPI_Comm_size(MPI_COMM_WORLD,&size); MPI_Comm_rank(MPI_COMM_WORLD,&rank); while ((opt = getopt_long(argc, argv, optstring, long_options, NULL)) != EOF) { switch(opt) { case 'i': if (strcmp("procs", optarg) == 0) { id = true; } break; case 'p': nt = atoi(optarg); if (nt > max_threads) { printf("Using too much procs %d, use max %d", nt, max_threads); return EXIT_FAILURE; } else { printf("Using %d procs.", nt); } case 'n': N = atoi(optarg); break; case 'f': f = fopen(optarg,"a"); if (f == NULL) { mpi_printf(root, "Could not open log file '%s': %s\n", optarg, strerror(errno)); MPI_Finalize(); return EXIT_FAILURE; } break; case 'a': if (strcmp("ref", optarg) == 0) { mpi_printf(root, "Using reference implementation \n"); algo = ref; } else if ((strcmp("reduce_scatter", optarg) == 0)) { mpi_printf(root, "Using MPI_Allgather implementation \n"); algo = reduce_scatter; } break; default: MPI_Finalize(); return EXIT_FAILURE; } } if(N == 0) { if ( rank == root ){ printf("Usage: mpirun -nn nodecount p3-reduce_scatter.exe -n N\n"); printf("N is the the matrix size. 
\n\n"); } return 1; } /* ======================================================== */ /* Initialisation matrix & vector */ ATYPE *matrix = NULL; ATYPE *vector = NULL; if (rank == root) { debug("Setting up root data structures"); matrix = init_matrix(N,1); vector = init_vector(N,1); } int colcnt = N - (N/size ) * (size - 1 ); int partition = N/size; ATYPE *local_matrix = NULL; local_matrix = (ATYPE*) malloc (sizeof(ATYPE) * N * colcnt); ATYPE *local_vector = NULL; local_vector = (ATYPE*) malloc (sizeof(ATYPE) * partition) ; ATYPE *reference = NULL; reference = init_vector(N,1); ATYPE *result = NULL; result = init_vector(N,1); double inittime,totaltime; if( algo == ref) { if (rank == root) { inittime = MPI_Wtime(); matrix_vector_mult_ref(matrix, vector, N, reference); totaltime = MPI_Wtime() - inittime; } } else if (algo == reduce_scatter) { if(rank == root){ debug("Comptuting reference"); matrix_vector_mult_ref(matrix, vector, N, reference); } MPI_Barrier(MPI_COMM_WORLD); /* ======================================================== */ /* distributing matrix and vector */ distribute_vector(vector, local_vector, rank, size, partition, N); distribute_matrix(matrix, local_matrix, rank, size, partition, N); debug("begin MPI_Reduce_scatter"); MPI_Barrier(MPI_COMM_WORLD); inittime = MPI_Wtime(); compute_reduce_scatter(local_matrix, local_vector, result, rank, size, N, partition); MPI_Barrier(MPI_COMM_WORLD); totaltime = MPI_Wtime() - inittime; double localtime = totaltime; MPI_Reduce(&localtime, &totaltime, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); debug("after MPI_Reduce_scatter"); /* TODO: fix test so it uses vector idea */ /* debug("Testing result"); */ /* if (test_vector_part(result, local_vector, (rank * partition) , partition)) { */ /* debug("testresult: OK"); */ /* } else { */ /* debug("testresult: FAILURE"); */ /* debug("Result:"); */ /* printArray(recvbuff, N); */ /* debug("Reference:"); */ /* printArray(reference,N); */ /* } */ MPI_Barrier(MPI_COMM_WORLD); } 
if (rank == 0) { if (f != NULL) { if (id) { fprintf(f,"%d,%lf\n",nt, totaltime); } else { fprintf(f,"%d,%lf\n",N, totaltime); } } if (id) { printf("%d,%lf\n",nt , totaltime); } else { printf("%d,%lf\n",N , totaltime); } } debug("cleaning up"); free(vector); free(matrix); MPI_Finalize(); if ( f != NULL) { fclose(f); } return 0; }
/*
 * Check one transform implementation against the reference transform.
 *
 * func    implementation under test
 * psize   number of leading components of each source vector that are
 *         filled with random data (must be <= 4)
 * mtype   index into mtypes[] / templates[] selecting the matrix layout
 * cycles  receives the cycle count when mesa_profile is set
 *
 * A 16-byte aligned 4x4 matrix is built from the template for `mtype`,
 * TEST_COUNT source vectors are generated, and the output of `func` is
 * compared component by component with ref_transform().
 *
 * Returns 1 when every component matches to REQUIRED_PRECISION bits,
 * 0 otherwise (after printing the offending vector).
 *
 * The buffers s, d and r are not declared here; presumably they are
 * file-scope arrays of TEST_COUNT vectors -- confirm at their definition.
 */
static int test_transform_function( transform_func func, int psize,
                                    int mtype, unsigned long *cycles )
{
   GLvector4f source[1], dest[1], ref[1];
   GLmatrix mat[1];
   GLfloat *m;
   int i, j;
#ifdef RUN_DEBUG_BENCHMARK
   int cycle_i;                /* the counter for the benchmarks we run */
#endif

   (void) cycles;

   if ( psize > 4 ) {
      _mesa_problem( NULL, "test_transform_function called with psize > 4\n" );
      return 0;
   }

   mat->m = (GLfloat *) _mesa_align_malloc( 16 * sizeof(GLfloat), 16 );
   mat->type = mtypes[mtype];

   m = mat->m;
   ASSERT( ((long)m & 15) == 0 );

   init_matrix( m );

   /* Overwrite the fixed entries of the matrix according to the template;
    * VAR entries keep whatever init_matrix() stored.  The template is read
    * at [i*4+j] while the matrix is written at [j*4+i]. */
   for ( i = 0 ; i < 4 ; i++ ) {
      for ( j = 0 ; j < 4 ; j++ ) {
         switch ( templates[mtype][i * 4 + j] ) {
         case NIL:
            m[j * 4 + i] = 0.0;
            break;
         case ONE:
            m[j * 4 + i] = 1.0;
            break;
         case NEG:
            m[j * 4 + i] = -1.0;
            break;
         case VAR:
            break;
         default:
            ASSERT(0);
            /* release the matrix storage: this path used to leak it */
            _mesa_align_free( mat->m );
            return 0;
         }
      }
   }

   for ( i = 0 ; i < TEST_COUNT ; i++) {
      ASSIGN_4V( d[i], 0.0, 0.0, 0.0, 1.0 );
      ASSIGN_4V( s[i], 0.0, 0.0, 0.0, 1.0 );
      for ( j = 0 ; j < psize ; j++ )
         s[i][j] = rnd();
   }

   source->data = (GLfloat(*)[4])s;
   source->start = (GLfloat *)s;
   source->count = TEST_COUNT;
   source->stride = sizeof(s[0]);
   source->size = 4;
   source->flags = 0;

   dest->data = (GLfloat(*)[4])d;
   dest->start = (GLfloat *)d;
   dest->count = TEST_COUNT;
   dest->stride = sizeof(float[4]);
   dest->size = 0;
   dest->flags = 0;

   ref->data = (GLfloat(*)[4])r;
   ref->start = (GLfloat *)r;
   ref->count = TEST_COUNT;
   ref->stride = sizeof(float[4]);
   ref->size = 0;
   ref->flags = 0;

   ref_transform( ref, mat, source );

   if ( mesa_profile ) {
      BEGIN_RACE( *cycles );
      func( dest, mat->m, source );
      END_RACE( *cycles );
   }
   else {
      func( dest, mat->m, source );
   }

   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
      for ( j = 0 ; j < 4 ; j++ ) {
         if ( significand_match( d[i][j], r[i][j] ) < REQUIRED_PRECISION ) {
            /* Dump all four components of the first mismatching vector. */
            printf("-----------------------------\n" );
            printf("(i = %i, j = %i)\n", i, j );
            printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                   d[i][0], r[i][0], r[i][0]-d[i][0],
                   MAX_PRECISION - significand_match( d[i][0], r[i][0] ) );
            printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                   d[i][1], r[i][1], r[i][1]-d[i][1],
                   MAX_PRECISION - significand_match( d[i][1], r[i][1] ) );
            printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                   d[i][2], r[i][2], r[i][2]-d[i][2],
                   MAX_PRECISION - significand_match( d[i][2], r[i][2] ) );
            printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                   d[i][3], r[i][3], r[i][3]-d[i][3],
                   MAX_PRECISION - significand_match( d[i][3], r[i][3] ) );
            /* release the matrix storage: this path used to leak it */
            _mesa_align_free( mat->m );
            return 0;
         }
      }
   }

   _mesa_align_free( mat->m );
   return 1;
}
/*
 * Check one normal-transform implementation against the reference code.
 *
 * func    implementation under test
 * mtype   index into norm_templates[], norm_scale_types[] and
 *         norm_normalize_types[]
 * cycles  receives the cycle count when mesa_profile is set
 *
 * The implementation is run twice: once without precomputed lengths
 * (results in d, compared with r) and once with the `length` array
 * (results in d2).  The second result is only compared (against r2) when
 * norm_normalize_types[mtype] is non-zero.
 *
 * Returns 1 if all three components of every normal match to
 * REQUIRED_PRECISION bits, 0 otherwise (after printing the mismatch).
 */
static int test_norm_function( normal_func func, int mtype, long *cycles )
{
   GLvector4f source[1], dest[1], dest2[1], ref[1], ref2[1];
   GLmatrix mat[1];
   GLfloat s[TEST_COUNT][5], d[TEST_COUNT][4], r[TEST_COUNT][4];
   GLfloat d2[TEST_COUNT][4], r2[TEST_COUNT][4], length[TEST_COUNT];
   GLfloat scale;
   GLfloat *m;
   int i, j;
#ifdef RUN_DEBUG_BENCHMARK
   int cycle_i;                /* the counter for the benchmarks we run */
#endif

   (void) cycles;

   mat->m = (GLfloat *) ALIGN_MALLOC( 16 * sizeof(GLfloat), 16 );
   /* the same storage is used for the matrix and its inverse */
   mat->inv = m = mat->m;

   init_matrix( m );

   scale = 1.0F + rnd () * norm_scale_types[mtype];

   /* Force the fixed entries required by the template; VAR entries keep the
    * values written by init_matrix(). */
   for ( i = 0 ; i < 4 ; i++ ) {
      for ( j = 0 ; j < 4 ; j++ ) {
         switch ( norm_templates[mtype][i * 4 + j] ) {
         case NIL:
            m[j * 4 + i] = 0.0;
            break;
         case ONE:
            m[j * 4 + i] = 1.0;
            break;
         case NEG:
            m[j * 4 + i] = -1.0;
            break;
         case VAR:
            break;
         default:
            _mesa_exit(1);
         }
      }
   }

   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
      ASSIGN_3V( d[i],  0.0, 0.0, 0.0 );
      ASSIGN_3V( s[i],  0.0, 0.0, 0.0 );
      ASSIGN_3V( d2[i], 0.0, 0.0, 0.0 );
      for ( j = 0 ; j < 3 ; j++ )
         s[i][j] = rnd();
      /* reciprocal length handed to the "precalculated length" run */
      length[i] = 1 / SQRTF( LEN_SQUARED_3FV( s[i] ) );
   }

   source->data = (GLfloat(*)[4]) s;
   source->start = (GLfloat *) s;
   source->count = TEST_COUNT;
   source->stride = sizeof(s[0]);
   source->flags = 0;

   dest->data = d;
   dest->start = (GLfloat *) d;
   dest->count = TEST_COUNT;
   dest->stride = sizeof(float[4]);
   dest->flags = 0;

   dest2->data = d2;
   dest2->start = (GLfloat *) d2;
   dest2->count = TEST_COUNT;
   dest2->stride = sizeof(float[4]);
   dest2->flags = 0;

   ref->data = r;
   ref->start = (GLfloat *) r;
   ref->count = TEST_COUNT;
   ref->stride = sizeof(float[4]);
   ref->flags = 0;

   ref2->data = r2;
   ref2->start = (GLfloat *) r2;
   ref2->count = TEST_COUNT;
   ref2->stride = sizeof(float[4]);
   ref2->flags = 0;

   if ( norm_normalize_types[mtype] == 0 ) {
      ref_norm_transform_rescale( mat, scale, source, NULL, ref );
   }
   else {
      ref_norm_transform_normalize( mat, scale, source, NULL, ref );
      ref_norm_transform_normalize( mat, scale, source, length, ref2 );
   }

   if ( mesa_profile ) {
      /* only the run without precomputed lengths is timed */
      BEGIN_RACE( *cycles );
      func( mat, scale, source, NULL, dest );
      END_RACE( *cycles );
      func( mat, scale, source, length, dest2 );
   }
   else {
      func( mat, scale, source, NULL, dest );
      func( mat, scale, source, length, dest2 );
   }

   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
      for ( j = 0 ; j < 3 ; j++ ) {
         if ( significand_match( d[i][j], r[i][j] ) < REQUIRED_PRECISION ) {
            _mesa_printf( "-----------------------------\n" );
            _mesa_printf( "(i = %i, j = %i)\n", i, j );
            _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                          d[i][0], r[i][0], r[i][0]/d[i][0],
                          MAX_PRECISION - significand_match( d[i][0], r[i][0] ) );
            _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                          d[i][1], r[i][1], r[i][1]/d[i][1],
                          MAX_PRECISION - significand_match( d[i][1], r[i][1] ) );
            _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                          d[i][2], r[i][2], r[i][2]/d[i][2],
                          MAX_PRECISION - significand_match( d[i][2], r[i][2] ) );
            /* release the matrix storage: this path used to leak it */
            ALIGN_FREE( mat->m );
            return 0;
         }

         if ( norm_normalize_types[mtype] != 0 ) {
            if ( significand_match( d2[i][j], r2[i][j] ) < REQUIRED_PRECISION ) {
               _mesa_printf( "------------------- precalculated length case ------\n" );
               _mesa_printf( "(i = %i, j = %i)\n", i, j );
               _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                             d2[i][0], r2[i][0], r2[i][0]/d2[i][0],
                             MAX_PRECISION - significand_match( d2[i][0], r2[i][0] ) );
               _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                             d2[i][1], r2[i][1], r2[i][1]/d2[i][1],
                             MAX_PRECISION - significand_match( d2[i][1], r2[i][1] ) );
               _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                             d2[i][2], r2[i][2], r2[i][2]/d2[i][2],
                             MAX_PRECISION - significand_match( d2[i][2], r2[i][2] ) );
               /* release the matrix storage: this path used to leak it */
               ALIGN_FREE( mat->m );
               return 0;
            }
         }
      }
   }

   ALIGN_FREE( mat->m );
   return 1;
}
/**
 * @brief C-API matrix-matrix product wrapper.
 *
 * Wraps the three matrix descriptors in viennacl::matrix_base objects of the
 * precision recorded in A and forwards to viennacl::linalg::prod_impl(),
 * applying viennacl::trans() to A and/or B according to their `trans` fields.
 * alpha and beta are passed through as the scaling factors.
 *
 * Only A->precision selects the numeric type; B and C are interpreted with
 * the same type.
 *
 * @return ViennaCLSuccess after the product was dispatched;
 *         ViennaCLGenericFailure if a handle cannot be initialised, the
 *         precision is neither float nor double, or a `trans` field is
 *         neither ViennaCLTrans nor ViennaCLNoTrans.
 */
VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
{
  viennacl::backend::mem_handle A_handle;
  viennacl::backend::mem_handle B_handle;
  viennacl::backend::mem_handle C_handle;

  // Short-circuit evaluation keeps the A, B, C initialisation order.
  if (   init_matrix(A_handle, A) != ViennaCLSuccess
      || init_matrix(B_handle, B) != ViennaCLSuccess
      || init_matrix(C_handle, C) != ViennaCLSuccess)
    return ViennaCLGenericFailure;

  if (A->precision == ViennaCLFloat)
  {
    typedef viennacl::matrix_base<float>  matrix_t;
    typedef matrix_t::size_type           sz_t;
    typedef matrix_t::difference_type     diff_t;

    matrix_t lhs(A_handle,
                 sz_t(A->size1), sz_t(A->start1), diff_t(A->stride1), sz_t(A->internal_size1),
                 sz_t(A->size2), sz_t(A->start2), diff_t(A->stride2), sz_t(A->internal_size2),
                 A->order == ViennaCLRowMajor);
    matrix_t rhs(B_handle,
                 sz_t(B->size1), sz_t(B->start1), diff_t(B->stride1), sz_t(B->internal_size1),
                 sz_t(B->size2), sz_t(B->start2), diff_t(B->stride2), sz_t(B->internal_size2),
                 B->order == ViennaCLRowMajor);
    matrix_t out(C_handle,
                 sz_t(C->size1), sz_t(C->start1), diff_t(C->stride1), sz_t(C->internal_size1),
                 sz_t(C->size2), sz_t(C->start2), diff_t(C->stride2), sz_t(C->internal_size2),
                 C->order == ViennaCLRowMajor);

    const bool lhs_transposed = (A->trans == ViennaCLTrans);
    const bool rhs_transposed = (B->trans == ViennaCLTrans);

    // Reject any flag value that is neither Trans nor NoTrans.
    if (   (!lhs_transposed && A->trans != ViennaCLNoTrans)
        || (!rhs_transposed && B->trans != ViennaCLNoTrans))
      return ViennaCLGenericFailure;

    if (lhs_transposed)
    {
      if (rhs_transposed)
        viennacl::linalg::prod_impl(viennacl::trans(lhs), viennacl::trans(rhs), out, alpha->value_float, beta->value_float);
      else
        viennacl::linalg::prod_impl(viennacl::trans(lhs), rhs, out, alpha->value_float, beta->value_float);
    }
    else
    {
      if (rhs_transposed)
        viennacl::linalg::prod_impl(lhs, viennacl::trans(rhs), out, alpha->value_float, beta->value_float);
      else
        viennacl::linalg::prod_impl(lhs, rhs, out, alpha->value_float, beta->value_float);
    }

    return ViennaCLSuccess;
  }

  if (A->precision == ViennaCLDouble)
  {
    typedef viennacl::matrix_base<double> matrix_t;
    typedef matrix_t::size_type           sz_t;
    typedef matrix_t::difference_type     diff_t;

    matrix_t lhs(A_handle,
                 sz_t(A->size1), sz_t(A->start1), diff_t(A->stride1), sz_t(A->internal_size1),
                 sz_t(A->size2), sz_t(A->start2), diff_t(A->stride2), sz_t(A->internal_size2),
                 A->order == ViennaCLRowMajor);
    matrix_t rhs(B_handle,
                 sz_t(B->size1), sz_t(B->start1), diff_t(B->stride1), sz_t(B->internal_size1),
                 sz_t(B->size2), sz_t(B->start2), diff_t(B->stride2), sz_t(B->internal_size2),
                 B->order == ViennaCLRowMajor);
    matrix_t out(C_handle,
                 sz_t(C->size1), sz_t(C->start1), diff_t(C->stride1), sz_t(C->internal_size1),
                 sz_t(C->size2), sz_t(C->start2), diff_t(C->stride2), sz_t(C->internal_size2),
                 C->order == ViennaCLRowMajor);

    const bool lhs_transposed = (A->trans == ViennaCLTrans);
    const bool rhs_transposed = (B->trans == ViennaCLTrans);

    // Reject any flag value that is neither Trans nor NoTrans.
    if (   (!lhs_transposed && A->trans != ViennaCLNoTrans)
        || (!rhs_transposed && B->trans != ViennaCLNoTrans))
      return ViennaCLGenericFailure;

    if (lhs_transposed)
    {
      if (rhs_transposed)
        viennacl::linalg::prod_impl(viennacl::trans(lhs), viennacl::trans(rhs), out, alpha->value_double, beta->value_double);
      else
        viennacl::linalg::prod_impl(viennacl::trans(lhs), rhs, out, alpha->value_double, beta->value_double);
    }
    else
    {
      if (rhs_transposed)
        viennacl::linalg::prod_impl(lhs, viennacl::trans(rhs), out, alpha->value_double, beta->value_double);
      else
        viennacl::linalg::prod_impl(lhs, rhs, out, alpha->value_double, beta->value_double);
    }

    return ViennaCLSuccess;
  }

  // Unsupported precision.
  return ViennaCLGenericFailure;
}
int main(int argc, char **argv) { int rank; int world_size; /* * Initialization */ int thread_support; if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) { fprintf(stderr,"MPI_Init_thread failed\n"); exit(1); } if (thread_support == MPI_THREAD_FUNNELED) fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n"); if (thread_support < MPI_THREAD_FUNNELED) fprintf(stderr,"Warning: MPI does not have thread support!\n"); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &world_size); starpu_srand48((long int)time(NULL)); parse_args(rank, argc, argv); int ret = starpu_init(NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* We disable sequential consistency in this example */ starpu_data_set_default_sequential_consistency_flag(0); starpu_mpi_init(NULL, NULL, 0); STARPU_ASSERT(p*q == world_size); starpu_cublas_init(); int barrier_ret = MPI_Barrier(MPI_COMM_WORLD); STARPU_ASSERT(barrier_ret == MPI_SUCCESS); /* * Problem Init */ init_matrix(rank); fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank, (int)(allocated_memory/(1024*1024)), (int)(allocated_memory_extra/(1024*1024)), (int)((allocated_memory+allocated_memory_extra)/(1024*1024))); display_grid(rank, nblocks); TYPE *a_r = NULL; // STARPU_PLU(display_data_content)(a_r, size); TYPE *x, *y; if (check) { x = calloc(size, sizeof(TYPE)); STARPU_ASSERT(x); y = calloc(size, sizeof(TYPE)); STARPU_ASSERT(y); if (rank == 0) { unsigned ind; for (ind = 0; ind < size; ind++) x[ind] = (TYPE)starpu_drand48(); } a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks); if (rank == 0) STARPU_PLU(display_data_content)(a_r, size); // STARPU_PLU(compute_ax)(size, x, y, nblocks, rank); } barrier_ret = MPI_Barrier(MPI_COMM_WORLD); STARPU_ASSERT(barrier_ret == MPI_SUCCESS); double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size); /* * Report performance */ int reduce_ret; double min_timing = timing; double max_timing 
= timing; double sum_timing = timing; reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); STARPU_ASSERT(reduce_ret == MPI_SUCCESS); reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); STARPU_ASSERT(reduce_ret == MPI_SUCCESS); reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); STARPU_ASSERT(reduce_ret == MPI_SUCCESS); if (rank == 0) { fprintf(stderr, "Computation took: %f ms\n", max_timing/1000); fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000); fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000); fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000)); unsigned n = size; double flop = (2.0f*n*n*n)/3.0f; fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f)); } /* * Test Result Correctness */ if (check) { /* * Compute || A - LU || */ STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r); #if 0 /* * Compute || Ax - LUx || */ unsigned ind; y2 = calloc(size, sizeof(TYPE)); STARPU_ASSERT(y); if (rank == 0) { for (ind = 0; ind < size; ind++) { y2[ind] = (TYPE)0.0; } } STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank); /* Compute y2 = y2 - y */ CPU_AXPY(size, -1.0, y, 1, y2, 1); TYPE err = CPU_ASUM(size, y2, 1); int max = CPU_IAMAX(size, y2, 1); fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size)); fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]); #endif } /* * Termination */ barrier_ret = MPI_Barrier(MPI_COMM_WORLD); STARPU_ASSERT(barrier_ret == MPI_SUCCESS); starpu_cublas_shutdown(); starpu_mpi_shutdown(); starpu_shutdown(); #if 0 MPI_Finalize(); #endif return 0; }
/**
 * @brief C-API triangular solve wrapper with a matrix right-hand side.
 *
 * Wraps A and B in viennacl::matrix_base objects of the precision recorded
 * in A and forwards to viennacl::linalg::inplace_solve(), choosing
 *   - viennacl::trans() on A and/or B from their `trans` fields, and
 *   - upper_tag / unit_upper_tag / lower_tag / unit_lower_tag from
 *     `uplo` and `diag`.
 *
 * Changes relative to the previous revision:
 *   - The (A not transposed, B transposed) branch used to pass
 *     viennacl::trans(mat_A); it now passes mat_A as requested.
 *   - A `trans` combination that matches none of the four supported cases
 *     now yields ViennaCLGenericFailure (as ViennaCLgemm does) instead of
 *     returning ViennaCLSuccess without solving anything.
 *
 * @return ViennaCLSuccess after the solve was dispatched;
 *         ViennaCLGenericFailure if a handle cannot be initialised or the
 *         precision, trans, uplo or diag value is not supported.
 */
VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
{
  viennacl::backend::mem_handle A_handle;
  viennacl::backend::mem_handle B_handle;

  if (init_matrix(A_handle, A) != ViennaCLSuccess)
    return ViennaCLGenericFailure;

  if (init_matrix(B_handle, B) != ViennaCLSuccess)
    return ViennaCLGenericFailure;

  switch (A->precision)
  {
    case ViennaCLFloat:
    {
      typedef viennacl::matrix_base<float>::size_type           size_type;
      typedef viennacl::matrix_base<float>::difference_type     difference_type;

      viennacl::matrix_base<float> mat_A(A_handle,
                                         size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
                                         size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
      viennacl::matrix_base<float> mat_B(B_handle,
                                         size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
                                         size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);

      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
      {
        // A is NOT transposed here; only B is.
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else
        return ViennaCLGenericFailure;  // unknown trans flag: nothing was solved

      return ViennaCLSuccess;
    }

    case ViennaCLDouble:
    {
      typedef viennacl::matrix_base<double>::size_type           size_type;
      typedef viennacl::matrix_base<double>::difference_type     difference_type;

      viennacl::matrix_base<double> mat_A(A_handle,
                                          size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
                                          size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
      viennacl::matrix_base<double> mat_B(B_handle,
                                          size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
                                          size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);

      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
      {
        // A is NOT transposed here; only B is.
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else
        return ViennaCLGenericFailure;  // unknown trans flag: nothing was solved

      return ViennaCLSuccess;
    }

    default:
      return ViennaCLGenericFailure;
  }
}
int main(void) { crosslist one,two,three; int choice;//as a mark of selection char flag;//selection mark while(1) { system("cls"); system("color 81"); system("mode con cols=80 lines=400"); system("title #Crosslist To Deal With Sparse Matrix#"); printf("\t@*************************************************************@\n"); putchar('\n'); printf("\t\t %c----------稀疏矩阵-应用程序系统----------%c\n",2,2); putchar('\n'); printf("\t@*************************************************************@\n"); printf("\t$*************************************************************$\n"); printf("\t\t %c----------------功能选择-----------------%c\n",2,2); putchar('\n'); printf("\t\t %c-----------------------------------------%c\n",2,2); printf("\t\t %c <1> 稀疏矩阵的加法运算 %c\n",2,2); printf("\t\t %c-----------------------------------------%c\n",2,2); printf("\t\t %c <2> 稀疏矩阵的减法运算 %c\n",2,2); printf("\t\t %c-----------------------------------------%c\n",2,2); printf("\t\t %c <3> 稀疏矩阵的乘法运算 %c\n",2,2); printf("\t\t %c-----------------------------------------%c\n",2,2); printf("\t\t %c <4> 退出应用程序 %c\n",2,2); printf("\t\t %c-----------------------------------------%c\n",2,2); putchar('\n'); printf("\t\t %c-----------矩阵以行序为主序-----------%c\n",2,2); printf("\t$*************************************************************$\n"); printf("\t\t!!注意:如果想终止程序,请按 Ctrl +C\n"); printf("\t$*************************************************************$\n\n"); printf("请输入你的选择:(1--4)\n"); fflush(stdin);//清空输入缓冲区 printf("你的选择是:"); scanf("%d",&choice); putchar('\n'); switch(choice) { case 1: printf("\t<加法运算>\n"); putchar('\n'); init_matrix(one);//初始化矩阵one printf("\t<建立第一个矩阵>\n"); creat_matrix(one); putchar('\n'); printf("第一个矩阵如下:\n"); printf("-------------------------------------------------\n"); print_matrix(one); printf("-------------------------------------------------\n"); init_matrix(two);//初始化矩阵two putchar('\n'); printf("\t<建立第二个矩阵>\n"); creat_matrix(two); putchar('\n'); printf("第二个矩阵如下:\n"); 
printf("-------------------------------------------------\n"); print_matrix(two); printf("-------------------------------------------------\n"); /*add the two matrix*/ putchar('\n'); printf("两个矩阵相加\n"); init_matrix(three);//初始化矩阵three putchar('\n'); add_matrix(one,two,three); printf("结果如下:\n"); printf("-------------------------------------------------\n"); Sleep(1000); print_matrix(three); printf("-------------------------------------------------\n"); system("pause"); break; case 2: printf("\t<减法运算>\n"); putchar('\n'); init_matrix(one);//初始化矩阵one printf("\t<建立第一个矩阵>\n"); creat_matrix(one); putchar('\n'); printf("第一个矩阵如下:\n"); printf("-------------------------------------------------\n"); print_matrix(one); printf("-------------------------------------------------\n"); init_matrix(two);//初始化矩阵two putchar('\n'); printf("\t<建立第二个矩阵>\n"); creat_matrix(two); putchar('\n'); printf("第二个矩阵如下:\n"); printf("-------------------------------------------------\n"); print_matrix(two); printf("-------------------------------------------------\n"); /*add the two matrix*/ putchar('\n'); printf("两个矩阵相减\n"); init_matrix(three);//初始化矩阵three putchar('\n'); opposite_matrix(two); add_matrix(one,two,three); printf("结果如下:\n"); printf("-------------------------------------------------\n"); Sleep(1000); print_matrix(three); printf("-------------------------------------------------\n"); system("pause"); break; case 3: printf("\t<乘法运算>\n"); putchar('\n'); init_matrix(one);//初始化矩阵one putchar('\n'); printf("\t<建立第一个矩阵>\n"); creat_matrix(one); putchar('\n'); printf("第一个矩阵如下:\n"); printf("-------------------------------------------------\n"); print_matrix(one); printf("-------------------------------------------------\n"); init_matrix(two);//初始化矩阵two putchar('\n'); printf("\t<建立第二个矩阵>\n"); creat_matrix(two); putchar('\n'); printf("第二个矩阵如下:\n"); printf("-------------------------------------------------\n"); print_matrix(two); printf("-------------------------------------------------\n"); /*multiply 
the two matrix*/ putchar('\n'); printf("两个矩阵相乘\n"); init_matrix(three); multi_matrix(one,two,three); printf("结果如下:\n"); printf("-------------------------------------------------\n"); Sleep(1000); print_matrix(three); printf("-------------------------------------------------\n"); system("pause"); break; case 4: printf("你确定退出程序吗<Y/N>?\n"); fflush(stdin); scanf("%c",&flag); if(flag=='y'||flag=='Y'||flag=='\n') { printf("\t\t%c-------%c-------%c-------%c-------%c\n",2,2,2,2,2); putchar('\n'); printf("\t\t(^_^)谢谢使用!(^_^)\n"); putchar('\n'); printf("\t\t%c-------%c-------%c-------%c-------%c\n",2,2,2,2,2); putchar('\n'); Sleep(2000); exit(1); } else { printf("………欢迎继续使用………\n"); Sleep(2000); } break; default:printf("请输入有效的选择 1 ~ 4!\n"); Sleep(2000); break; }//switch }//while return 1; }
int main(int argc, char * argv[]) { pthread_t *tid;//number of thread args *arg; int total_processes; double *a, *b, *x; int res1, res2; long int t; int n; int N; int i; char * filename = 0; const char * name = "c.txt"; if( argc != 3 && argc != 4 ) { printf("Usage : %s <n> <total_processes> <filename>\n", argv[0]); return 0; } n = atoi(argv[1]);//from number to string total_processes = atoi (argv[2]); if(!n || !total_processes) { printf("Usage : %s <n> <total_processes> <filename>\n", argv[0]); return 0; } a = new double[n*n]; b = new double[n]; x = new double[n]; tid = new pthread_t[total_processes]; arg = new args[total_processes]; if(argc > 3) filename = argv[3]; if(filename) { res1 = read_matrix(a, n, "a.txt"); res2 = read_vector(b, n, "b.txt"); if(res1 || res2) { printf("cannot read from file\n"); delete [] tid; delete [] arg; delete [] a; delete [] b; delete [] x; return 1; } } else { init_matrix(a, n); init_vector(b, a, n); } printf("matrix A:\n"); print_matrix(a, n); printf("vector b:\n"); print_vector(b, n); for (i = 0; i < total_processes; i++) { arg[i].a = a; arg[i].b = b; arg[i].n = n; arg[i].total_processes = total_processes; arg[i].num_process = i; arg[i].error = 0; } t = get_full_time (); for (i = 0; i < total_processes; i++) { if (pthread_create (tid + i, 0, &thread_method_of_reflections, arg + i)) { printf ("Cannot create thread %d\n", i); return 2; } } for (i = 0; i < total_processes; i++) pthread_join (tid[i], 0); back_hod(a, b, x, n); t = get_full_time () - t; N = (n < MAX_N) ? 
n : MAX_N; printf("result : "); for(i = 0; i < N; i++) printf("%lg ", x[i]); printvectorfile(x,n,name); if(filename) { read_matrix(a, n, "a.txt"); read_vector(b, n, "b.txt"); printf("\nResidual = %le\nElapsed time = %Lg\n",SolutionError(n,a,b,x),(long double)t/(CLOCKS_PER_SEC)); } else { init_matrix(a, n); init_vector(b, a, n); printf("\nResidual = %le\nError = %le\nElapsed time = %Lg\n",SolutionError(n,a,b,x), SolutionAccuracy(n,x),(long double)t/(CLOCKS_PER_SEC)); } delete [] tid; delete [] arg; delete [] a; delete [] b; delete [] x; return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dgetrf_mgpu */ int main( int argc, char** argv ) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; double error; double *h_A; magmaDouble_ptr d_lA[ MagmaMaxGPUs ]; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, n_local, ngpu; magma_int_t info, min_mn, nb, ldn_local; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; nb = magma_get_dgetrf_nb( M ); gflops = FLOPS_DGETRF( M, N ) / 1e9; // ngpu must be at least the number of blocks ngpu = min( opts.ngpu, int((N+nb-1)/nb) ); if ( ngpu < opts.ngpu ) { printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu ); } // Allocate host memory for the matrix TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_CPU( h_A, double, n2 ); // Allocate device memory for( int dev=0; dev < ngpu; dev++ ) { n_local = ((N/nb)/ngpu)*nb; if (dev < (N/nb) % ngpu) n_local += nb; else if (dev == (N/nb) % ngpu) n_local += N % nb; ldn_local = ((n_local+31)/32)*32; // TODO why? 
magma_setdevice( dev ); TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local ); } /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info ); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb ); gpu_time = magma_wtime(); magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info ); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dgetrf_mgpu returned error %d: %s.\n", (int) info, magma_strerror( info )); magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb ); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! 
(error < tol); } else { printf( " ---\n" ); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( h_A ); for( int dev=0; dev < ngpu; dev++ ) { magma_setdevice( dev ); TESTING_FREE_DEV( d_lA[dev] ); } fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
void fem(size_t n, double errors[2], double (*fn_f)(double, double), double (*fn_g)(unsigned char, double, double), double (*fn_u)(double, double)) { mesh m; crs_matrix mat; double * u, * rhs; double local_stiffness[3][3]; double local_load[3]; size_t elem; /* 1. Allocate and generate mesh */ get_mesh(&m, n); #ifdef PRINT_DEBUG print_mesh(&m); #endif /* 2. Allocate the linear system */ init_matrix(&mat, &m); u = (double *) malloc(sizeof(double) * m.n_vertices); if (u == NULL) err_exit("Allocation of solution vector failed!"); memset(u, 0, sizeof(double) * m.n_vertices); rhs = (double *) malloc(sizeof(double) * m.n_vertices); if (rhs == NULL) err_exit("Allocation of right hand side failed!"); memset(rhs, 0, sizeof(double) * m.n_vertices); /* 3. Assemble the matrix */ for (elem = 0; elem < m.n_triangles; ++elem) { /* Compute local stiffness and load */ get_local_stiffness(local_stiffness, &m, elem); get_local_load(local_load, &m, elem, fn_f); #ifdef PRINT_DEBUG print_local_stiffness(local_stiffness); print_local_load(local_load); #endif /* insert into global matrix and rhs */ assemble_local2global_stiffness(local_stiffness, &mat, &m, elem); assemble_local2global_load(local_load, rhs, &m, elem); } #ifdef PRINT_DEBUG printf("Matrix after assembly:\n"); print_matrix(&mat); printf("rhs after assembly:\n"); for(size_t i = 0; i < m.n_vertices; ++i) { printf("%5.2f\n", rhs[i]); } printf("\n"); #endif /* 4. Apply boundary conditions */ apply_dbc(&mat, rhs, &m, fn_g); #ifdef PRINT_DEBUG printf("Matrix after application of BCs:\n"); print_matrix(&mat); printf("rhs after application of BCs:\n"); for(size_t i = 0; i < m.n_vertices; ++i) { printf("%5.2f\n", rhs[i]); } printf("\n"); #endif /* 5. Solve the linear system */ solve(&mat, u, rhs); /* 6. Evaluate error */ errors[0] = l2_norm(u, &m, fn_u); errors[1] = inf_norm(u, &m, fn_u); /* free allocated resources */ free(u); free(rhs); rhs = NULL; free_matrix(&mat); free_mesh(&m); }
/*
 * Allocates a rows x cols matrix of zero-initialised ints as an array of row
 * pointers. Returns NULL, with every partial allocation released, on failure.
 */
static int **alloc_int_matrix(int rows, int cols)
{
    int r;
    int **m = (int **) calloc(rows, sizeof(int *));

    if (m == NULL) {
        return NULL;
    }
    for (r = 0; r < rows; r++) {
        m[r] = (int *) calloc(cols, sizeof(int));
        if (m[r] == NULL) {
            while (r-- > 0) {
                free(m[r]);
            }
            free(m);
            return NULL;
        }
    }
    return m;
}

/* Releases a matrix obtained from alloc_int_matrix(); NULL is accepted. */
static void free_int_matrix(int **m, int rows)
{
    int r;

    if (m == NULL) {
        return;
    }
    for (r = 0; r < rows; r++) {
        free(m[r]);
    }
    free(m);
}

/*
 * Parses a strictly positive decimal dimension that fits in an int.
 * Returns 1 and stores the value in *out on success, 0 otherwise
 * (non-numeric text parses as 0 and is therefore rejected).
 */
static int parse_dimension(const char *text, int *out)
{
    long value = strtol(text, (char **) NULL, 10);

    if (value <= 0 || (long) (int) value != value) {
        return 0;
    }
    *out = (int) value;
    return 1;
}

/*
 * Main function.
 *
 * Usage: <prog> rows_matrix1 cols_matrix1 cols_matrix2
 * Builds matrix1 (rows1 x cols1) and matrix2 (cols1 x cols2), fills them with
 * init_matrix(), and multiplies them into matrixR (rows1 x cols2).
 * Returns 0 on success and -1 on a usage or allocation error.
 */
int main (int argc, char **argv)
{
    int matrix1_fils = 0, matrix1_cols = 0, matrix2_fils, matrix2_cols = 0;
    int **matrix1, **matrix2, **matrixR;
    int i, j, k, acum;

    if (argc > 3) {
        printf("\n%s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
    }

    /* Zero, negative or non-numeric dimensions would otherwise be handed
     * straight to the allocator. */
    if (argc <= 3
        || !parse_dimension(argv[1], &matrix1_fils)
        || !parse_dimension(argv[2], &matrix1_cols)
        || !parse_dimension(argv[3], &matrix2_cols)) {
        fprintf(stderr, "Uso: %s filas_matriz1 columnas_matriz1 columnas_matriz2\n", argv[0]);
        return -1;
    }
    matrix2_fils = matrix1_cols;

    /* Allocate the three matrices */
    matrix1 = alloc_int_matrix(matrix1_fils, matrix1_cols);
    matrix2 = alloc_int_matrix(matrix2_fils, matrix2_cols);
    matrixR = alloc_int_matrix(matrix1_fils, matrix2_cols);
    if (matrix1 == NULL || matrix2 == NULL || matrixR == NULL) {
        fprintf(stderr, "Error: no se pudo reservar memoria para las matrices\n");
        free_int_matrix(matrix1, matrix1_fils);
        free_int_matrix(matrix2, matrix2_fils);
        free_int_matrix(matrixR, matrix1_fils);
        return -1;
    }

    init_matrix(matrix1, matrix1_fils, matrix1_cols);
    init_matrix(matrix2, matrix2_fils, matrix2_cols);

    /* Main loop: matrixR = matrix1 * matrix2 */
    for (j = 0; j < matrix2_cols; j++) {
        for (i = 0; i < matrix1_fils; i++) {
            acum = 0;
            for (k = 0; k < matrix1_cols; k++) {
                acum += matrix1[i][k] * matrix2[k][j];
            }
            matrixR[i][j] = acum;
        }
    }

#ifdef DEBUG
    print_matrix(matrixR, matrix1_fils, matrix2_cols);
#endif

    /* Release the memory used */
    free_int_matrix(matrix1, matrix1_fils);
    free_int_matrix(matrix2, matrix2_fils);
    free_int_matrix(matrixR, matrix1_fils);

    return 0;
}
int main(int argc, char *argv[]) { blocking_entry(); long long int start; long long int end; start = get_micro_clock(); int j, k, noproc, me_no; double sum; double t1, t2; pthread_t *threads; pthread_attr_t pthread_custom_attr; parm *arg; int n, i; if (argc != 3) { printf("Usage: %s n dim\n where n is no. of thread and dim is the size of matrix\n", argv[0]); exit(1); } n = atoi(argv[1]); if ((n < 1) || (n > MAX_THREAD)) { printf("The no of thread should between 1 and %d.\n", MAX_THREAD); exit(1); } NDIM = atoi(argv[2]); pthread_mutex_init(&lock, NULL); init_matrix(&a); init_matrix(&b); init_matrix(&c); for (i = 0; i < NDIM; i++) for (j = 0; j < NDIM; j++) { a[i][j] = i + j; b[i][j] = i + j; } threads = (pthread_t*) malloc(n * sizeof(pthread_t)); pthread_attr_init(&pthread_custom_attr); arg = (parm*) malloc(sizeof(parm) * n); /* setup barrier */ /* Start up thread */ /* Spawn thread */ for (i = 0; i < n; i++) { arg[i].id = i; arg[i].noproc = n; arg[i].dim = NDIM; arg[i].a = a; arg[i].b = b; arg[i].c = c; pthread_create(&threads[i], &pthread_custom_attr, worker, (void*) (arg+i)); } for (i = 0; i < n; i++) { pthread_join(threads[i], NULL); } /* print_matrix(NDIM); */ check_matrix(NDIM); free(arg); end = get_micro_clock(); fprintf(stderr, "> application runtime: %lld microseconds\n", end - start); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { TESTING_INIT(); real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[itest]; N = opts.nsize[itest]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_cgetrf( M, N, h_A, lda, ipiv, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetrf returned error %d: %s.\n", (int) info, 
magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e %s\n", error, (error < tol ? "ok" : "failed")); status += ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
int main(int argn, char** args){ double **U, **V, **P, **F, **G, **RS; int **Flag; char problem[60]; char parameters_filename[60]; char pgm[60]; char output_dirname[60]; double Re, UI, VI, PI, GX, GY, t_end, xlength, ylength, dt, dx, dy, alpha, omg, tau, eps, dt_value, dp; double res = 0, t = 0, n = 0; int imax, jmax, itermax, it; int wl, wr, wt, wb; int timestepsPerPlotting; char old_output_filename[128]; struct dirent *old_outputfile; DIR *output_dir; /* Variables for parallel program */ int iproc, jproc, myrank, il, ir, jb, jt, rank_l, rank_r, rank_b, rank_t, omg_i, omg_j, num_proc; double min_dt; double *bufSend, *bufRecv; double totalTime = 0; struct timespec previousTime, currentTime; MPI_Init(&argn, &args); MPI_Comm_size(MPI_COMM_WORLD, &num_proc); /* Read name of the problem from the command line arguments */ if(argn > 1) { strcpy(problem, args[1]); } else { printf("\n=== ERROR: Please provide the name of the problem\n=== e.g. Run ./sim problem_name if there is a problem_name.dat file.\n\n"); MPI_Finalize(); return 1; } /* Generate input filename based on problem name */ strcpy(parameters_filename, problem); strcat(parameters_filename, ".dat"); /* Read the program configuration file using read_parameters() */ read_parameters(parameters_filename, pgm, &Re, &UI, &VI, &PI, &GX, &GY, &t_end, &xlength, &ylength, &dt, &dx, &dy, &imax, &jmax, &alpha, &omg, &tau, &itermax, &eps, &dt_value, problem, &dp, &wl, &wr, &wt, &wb, ×tepsPerPlotting, &iproc, &jproc); printf("%s\n", pgm); /* Check if the number of processes is correct */ if(iproc * jproc != num_proc) { printf("\n=== ERROR: Number of processes is incorrect (iproc=%d, jproc=%d, -np=%d) ===\n\n", iproc, jproc, num_proc); MPI_Finalize(); return 1; } /* Create folder with the name of the problem */ strcpy(output_dirname, problem); strcat(output_dirname, "/"); strcat(output_dirname, problem); mkdir(problem, 0777); output_dir = opendir(problem); /* Delete existing files in output folder*/ while((old_outputfile = 
readdir(output_dir))) { sprintf(old_output_filename, "%s/%s", problem, old_outputfile->d_name); remove(old_output_filename); } /* Determine subdomain and neighbours for each process */ init_parallel(iproc, jproc, imax, jmax, &myrank, &il, &ir, &jb, &jt, &rank_l, &rank_r, &rank_b, &rank_t, &omg_i, &omg_j, num_proc); /* Set up the matrices (arrays) needed using the matrix() command */ U = matrix(il-2, ir+1, jb-1, jt+1); V = matrix(il-1, ir+1, jb-2, jt+1); P = matrix(il-1, ir+1, jb-1, jt+1); F = matrix(il-2, ir+1, jb-1, jt+1); G = matrix(il-1, ir+1, jb-2, jt+1); RS= matrix(il, ir, jb, jt); Flag = imatrix(il-1, ir+1, jb-1, jt+1); /* Assign initial values to u, v, p */ init_uvp(UI, VI, PI, il, ir, jb, jt, U, V, P); /* Allocate memory for buffers */ bufSend = malloc(max(ir-il+3, jt-jb+3) * sizeof(double)); bufRecv = malloc(max(ir-il+3, jt-jb+3) * sizeof(double)); /* Initialize lower part of the domain with UI = 0 for the flow_over_step problem */ /* (this code might be moved to somewhere else later) */ if(strcmp(problem, "flow_over_step") == 0) { init_matrix(U, il, ir, jb, min(jmax/2, jt), 0); } /* Initialization of flag field */ init_flag(pgm, imax, jmax, il, ir, jb, jt, Flag, dp); if(myrank == 0) { clock_gettime(CLOCK_MONOTONIC, ¤tTime); } while(t <= t_end){ /* Select δt */ calculate_dt(Re, tau, &dt, dx, dy, il, ir, jb, jt, U, V); MPI_Allreduce(&dt, &min_dt, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); dt = min_dt; /* Set boundary values for u and v */ boundaryvalues(il, ir, jb, jt, imax, jmax, U, V, wl, wr, wt, wb, Flag); /* Set special boundary values */ spec_boundary_val(problem, il, ir, jb, jt, imax, jmax, U, V, P, Re, xlength, ylength, dp); /* Compute F(n) and G(n) */ calculate_fg(Re, GX, GY, alpha, dt, dx, dy, il, ir, jb, jt, imax, jmax, U, V, F, G, Flag); /* Compute the right-hand side rs of the pressure equation */ calculate_rs(dt, dx, dy, il, ir, jb, jt, imax, jmax, F, G, RS); /* Perform SOR iterations */ it = 0; res = 1e6; while(it < itermax && res > eps){ 
sor(omg, dx, dy, dp, il, ir, jb, jt, imax, jmax, rank_l, rank_r, rank_b, rank_t, P, RS, &res, Flag, bufSend, bufRecv); it++; } /* Compute u(n+1) and v(n+1) */ calculate_uv(dt, dx, dy, il, ir, jb, jt, imax, jmax, U, V, F, G, P, Flag); /* Exchange velocity strips */ uv_com(U, V, il, ir, jb, jt, rank_l, rank_r, rank_b, rank_t, bufSend, bufRecv); t = t + dt; n++; /* Generate snapshot for current timestep */ if((int) n % timestepsPerPlotting == 0) { write_vtkFile(output_dirname, myrank, n, xlength, ylength, il, ir, jb, jt, imax, jmax, dx, dy, U, V, P); } /* Print out simulation time and whether the SOR converged */ if(myrank == 0) { /* Print simulation time */ printf("Time: %.4f", t); /* Print runtime */ previousTime = currentTime; clock_gettime(CLOCK_MONOTONIC, ¤tTime); totalTime += (double)currentTime.tv_sec + 1e-9 * currentTime.tv_nsec - (double)previousTime.tv_sec - 1e-9 * previousTime.tv_nsec; printf("\tRuntime: %.3f s (avg runtime/step: %.3f s)", totalTime, totalTime/n); if(res > eps) printf("\tDid not converge (res=%f, eps=%f)", res, eps); printf("\n"); } } /* Close the output folder */ closedir(output_dir); /* Tell user where to find the output */ if(myrank == 0) { printf("Please find the output in the folder \"%s\".\n", problem); } /* Free allocated memory */ free_matrix(U, il-2, ir+1, jb-1, jt+1); free_matrix(V, il-1, ir+1, jb-2, jt+1); free_matrix(P, il-1, ir+1, jb-1, jt+1); free_matrix(F, il-2, ir+1, jb-1, jt+1); free_matrix(G, il-1, ir+1, jb-2, jt+1); free_matrix(RS, il, ir, jb, jt); free_imatrix(Flag, il-1, ir+1, jb-1, jt+1); free(bufSend); free(bufRecv); MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); return 0; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing cgetrf */ int main( int argc, char** argv) { real_Double_t gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0; float error; magmaFloatComplex *h_A; magma_int_t *ipiv; magma_int_t M, N, n2, lda, ldda, info, min_mn; magma_int_t status = 0; /* Initialize */ magma_queue_t queue[2]; magma_device_t devices[MagmaMaxGPUs]; int num = 0; magma_err_t err; magma_init(); magma_opts opts; parse_opts( argc, argv, &opts ); float tol = opts.tolerance * lapackf77_slamch("E"); err = magma_get_devices( devices, MagmaMaxGPUs, &num ); if ( err != 0 || num < 1 ) { fprintf( stderr, "magma_get_devices failed: %d\n", err ); exit(-1); } // Create two queues on device opts.device err = magma_queue_create( devices[opts.device], &queue[0] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } err = magma_queue_create( devices[opts.device], &queue[1] ); if ( err != 0 ) { fprintf( stderr, "magma_queue_create failed: %d\n", err ); exit(-1); } printf("ngpu %d\n", (int) opts.ngpu ); if ( opts.check == 2 ) { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); } else { printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |PA-LU|/(N*|A|)\n"); } printf("=========================================================================\n"); for( int i = 0; i < opts.ntest; ++i ) { for( int iter = 0; iter < opts.niter; ++iter ) { M = opts.msize[i]; N = opts.nsize[i]; min_mn = min(M, N); lda = M; n2 = lda*N; ldda = ((M+31)/32)*32; gflops = FLOPS_CGETRF( M, N ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn ); TESTING_MALLOC_PIN( h_A, magmaFloatComplex, n2 ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { init_matrix( M, N, h_A, lda ); cpu_time = magma_wtime(); lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info); cpu_time = 
magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( M, N, h_A, lda ); gpu_time = magma_wtime(); magma_cgetrf( M, N, h_A, lda, ipiv, &info, queue); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_cgetrf returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) M, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 2 ) { error = get_residual( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else if ( opts.check ) { error = get_LU_error( M, N, h_A, lda, ipiv ); printf(" %8.2e%s\n", error, (error < tol ? "" : " failed")); status |= ! (error < tol); } else { printf(" --- \n"); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_A ); } if ( opts.niter > 1 ) { printf( "\n" ); } } magma_queue_destroy( queue[0] ); magma_queue_destroy( queue[1] ); magma_finalize(); return status; }
/**** WEKA specific functions *********/ matrix * WEKApopulateAccuracyMatrix(struct hash * config, int split, int fold) { char * trainingDir = hashMustFindVal(config, "trainingDir"); char * validationDir = hashMustFindVal(config, "validationDir"); char * modelDir = hashMustFindVal(config, "modelDir"); char filename[256]; //cat togetehr the training and validation KH values and record which were used to train safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", trainingDir, split, fold); matrix * trMetadata = WEKAtoMetadataMatrix(filename); safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", validationDir, split, fold); matrix * valMetadata = WEKAtoMetadataMatrix(filename); matrix * metadata = append_matrices(trMetadata, valMetadata, 1); struct slInt * trainingList = list_indices(trMetadata->cols); //create a labeled matrix for results to be stored in matrix * result = init_matrix(2, metadata->cols); safef(result->rowLabels[0], MAX_LABEL, "trainingAccuracies"); safef(result->rowLabels[1], MAX_LABEL, "testingAccuracies"); copy_matrix_labels(result, metadata, 2,2); result->labels=1; //read the results from file safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.training.results", modelDir, split, fold); FILE * fp = fopen(filename, "r"); if(fp == NULL) errAbort("Couldn't open %s for reading.", filename); //advance the cursor to where data starts char * line; while( (line = readLine(fp)) && line != NULL) { if(strstr(line, "inst#") != NULL) break; } //read each result and save to results matrix int i; for(i = 0; i < trMetadata->cols && (line = readLine(fp)) != NULL; i++) { if(strstr(line, ":?") == NULL) { if(strstr(line, " + ") == NULL) result->graph[0][i] = 1; else result->graph[0][i] = 0; } } safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.validation.results", modelDir, split, fold); fp = fopen(filename, "r"); if(fp == NULL) errAbort("Couldn't open %s for reading.", filename); //advance the cursor to where data 
starts while( (line = readLine(fp)) && line != NULL) { if(strstr(line, "inst#") != NULL) break; } //read each result and save to results matrix for(i = i; i < result->cols && (line = readLine(fp)) != NULL; i++) { if(strstr(line, ":?") == NULL) { if(strstr(line, " + ") == NULL) result->graph[1][i] = 1; else result->graph[1][i] = 0; } } free_matrix(trMetadata); free_matrix(valMetadata); free_matrix(metadata); slFreeList(&trainingList); return result; }
/*
 * Parses the name part of a JSON pseudo-variable and attaches the result to
 * the spec.
 *
 * The input is walked character by character through the state tables `next`
 * and `ignore` (set up once by init_matrix()); whenever the state changes,
 * get_value() is called with the span [start, cur) that belongs to the state
 * being left.
 *
 *   sp - spec to fill in: receives the parsed name, type PV_JSON_ID and the
 *        pv_get_json / pv_set_json callbacks
 *   in - text to parse
 *
 * Returns 0 on success and -1 on allocation failure, an unexpected character,
 * a name that ends in state ST_IDX, or a get_value() failure.
 */
int pv_parse_json_name (pv_spec_p sp, str *in)
{
	json_name * id;
	char * cur,* start;
	int state,next_state,prev_state;

	if( !inited )
		init_matrix();

	id = (json_name *) pkg_malloc(sizeof(json_name));
	if( id == NULL )
	{
		LM_ERR("Out of memory\n");
		return -1;
	}

	id->tags = NULL;
	id->end = &id->tags;

	state = ST_NAME;
	start = in->s;
	prev_state = -1;

	/* NOTE(review): the error returns below do not release id (nor any tags
	 * get_value() may already have attached to it) - confirm whether a
	 * matching release routine exists and should be called. */
	for( cur = in->s; cur < in->s + in->len; cur++)
	{
		/* Index through unsigned char: with a signed plain char, bytes
		 * >= 0x80 would otherwise convert to a huge unsigned value and read
		 * far outside the tables. Assumes the tables have one column per
		 * byte value - confirm against their declaration. */
		const unsigned char ch = (unsigned char)*cur;

		next_state = next[state][ch];

		if( next_state == ST_ERR)
		{
			LM_ERR("Unexpected char at position: %d in :(%.*s)\n", (int)(cur-in->s),in->len,in->s);
			return -1;
		}

		if( state != prev_state)
			start = cur;

		if( state != next_state)
			if ( get_value(state, id, start, cur) )
				return -1;

		/* step back so that the loop increment revisits this character */
		if( ignore[state][ch])
		{
			cur --;
		}

		prev_state = state;
		state = next_state;
	}

	if( state == ST_IDX)
	{
		LM_ERR("Mismatched paranthesis in:(%.*s)\n",in->len,in->s);
		return -1;
	}

	/* flush the span belonging to the final state */
	if( get_value(state, id, start, cur) )
		return -1;

	sp->pvp.pvn.u.dname = id ;
	sp->type = PV_JSON_ID;
	sp->getf = pv_get_json;
	sp->setf = pv_set_json;

	return 0;
}
int main(int argc, char *argv[]) { int portno; socklen_t clilen; struct sockaddr_in serv_addr, cli_addr; int n; if (argc < 2) { fprintf(stderr,"ERROR, no port provided\n"); exit(1); } sockfd = socket(AF_INET, SOCK_STREAM, 0); if (sockfd < 0) error("ERROR opening socket"); bzero((char *) &serv_addr, sizeof(serv_addr)); portno = atoi(argv[1]); serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr =inet_addr("127.0.0.1"); //INADDR_ANY; serv_addr.sin_port = htons(portno); if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) error("ERROR on binding"); listen(sockfd,5); clilen = sizeof(cli_addr); puts("This is the game of Tic Tac Toe.\n"); puts("waiting connection to established....\n"); newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen); if (newsockfd < 0) error("ERROR on accept"); char done; done = ' '; init_matrix(); do { disp_matrix(); puts("waiting for client move\n"); get_client_move(); disp_matrix(); done = check(); /* see if winner */ if(done!= ' ') break; /* winner!*/ get_player_move(); disp_matrix(); done = check(); /* see if winner */ } while(done== ' '); if(done=='X') printf("Player X won!\n"); else printf("Player O won!!!!\n"); disp_matrix(); /* show final positions */ close(newsockfd); close(sockfd); return 0; }
int main(int argc, char **argv) { int myrank, nproc; int rows, columns; /* amount of work per node (rows per worker) */ int mtype; /* message type: send/recv between master and workers */ int dest, src, offsetrow, offsetcolumn; double start_time, end_time; int i, j, k; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nproc); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); if (nproc == 4 || nproc == 2) { rows = SIZE / 2; } else if (nproc == 1) { rows = SIZE; } if (nproc == 4) { columns = SIZE / 2; } else if (nproc == 2 || nproc == 1) { columns = SIZE; } MPI_Type_contiguous(SIZE*rows,MPI_DOUBLE,&rowtype); MPI_Type_commit(&rowtype); MPI_Type_vector(SIZE,columns,SIZE,MPI_DOUBLE,&columntype); MPI_Type_commit(&columntype); MPI_Type_vector(rows,columns,SIZE,MPI_DOUBLE,&resulttype); MPI_Type_commit(&resulttype); if (myrank == 0) { /* Master task */ /* Initialization */ // printf("SIZE = %d, number of nodes = %d\n", SIZE, nproc); init_matrix(); start_time = MPI_Wtime(); /* Send part of matrix a and the whole matrix b to workers */ mtype = FROM_MASTER; offsetrow = 0; offsetcolumn = 0; for (dest = 1; dest < nproc; dest++) { if (DEBUG) printf(" sending %d rows and %d columns to task %d\n",rows,columns,dest); offsetrow = (offsetrow+rows)%SIZE; if (dest == 2) { offsetcolumn = (offsetcolumn+columns)%SIZE; } MPI_Send(&offsetrow, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD); MPI_Send(&offsetcolumn, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD); MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD); MPI_Send(&columns, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD); MPI_Send(&a[offsetrow][0], 1, rowtype, dest, mtype, MPI_COMM_WORLD); MPI_Send(&b[0][offsetcolumn], 1, columntype, dest, mtype, MPI_COMM_WORLD); } printf(" ---- SEND ----- Execution time on %2d nodes: %5.2f\n", myrank, MPI_Wtime()-start_time); /* let master do its part of the work */ for (i = 0; i < rows; i++) { for (j = 0; j < columns; j++) { c[i][j] = 0; for (k = 0; k < SIZE; k++) { c[i][j] += a[i][k] * b[k][j]; } } } printf("---- 
algo ----- Execution time on %2d nodes: %5.2f\n", myrank, MPI_Wtime()-start_time); /* collect the results from all the workers */ mtype = FROM_WORKER; for (src = 1; src < nproc; src++) { MPI_Recv(&offsetrow, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&offsetcolumn, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&rows, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&columns, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&c[offsetrow][offsetcolumn], 1, resulttype, src, mtype, MPI_COMM_WORLD, &status); if (DEBUG) printf(" recvd %d rows and %d columns from task %d, offsetrow = %d, offsetcolumn = %d\n", rows, columns, src, offsetrow, offsetcolumn); } printf(" ---- RECV ----- Execution time on %2d nodes: %5.2f\n", myrank, MPI_Wtime()-start_time); end_time = MPI_Wtime(); printf("Execution time on %2d nodes: %f\n", nproc, end_time-start_time); //if (DEBUG) /* Prints the resulting matrix c */ //print_matrix(); } else { /* Worker tasks */ /* Receive data from master */ mtype = FROM_MASTER; MPI_Recv(&offsetrow, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&offsetcolumn, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&rows, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&columns, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&a[offsetrow][0],1, rowtype, 0, mtype, MPI_COMM_WORLD, &status); MPI_Recv(&b[0][offsetcolumn], 1, columntype, 0, mtype, MPI_COMM_WORLD, &status); if (DEBUG) printf ("Rank=%d, offsetrow=%d,offsetcolumn=%d, row =%d, column=%d, a[offsetrow][0]=%e, b[0][offsetcolumn]=%e\n", myrank, offsetrow, offsetcolumn, rows, columns, a[offsetrow][0], b[0][offsetcolumn]); /* do the workers part of the calculation */ for (i=offsetrow; i<offsetrow+rows; i++) { for (j=offsetcolumn; j<offsetcolumn+columns; j++) { c[i][j] = 0.0; for (k=0; k<SIZE; k++){ c[i][j] = c[i][j] + a[i][k] * b[k][j]; } } } if (DEBUG) printf ("Rank=%d, offsetrow=%d, offsetcolumn=%d,row =%d, column=%d, 
c[offsetrow][0]=%e\n", myrank, offsetrow,offsetcolumn, rows, columns, a[offsetrow][0]); /* send the results to the master */ mtype = FROM_WORKER; MPI_Send(&offsetrow, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD); MPI_Send(&offsetcolumn, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD); MPI_Send(&rows, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD); MPI_Send(&columns, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD); MPI_Send(&c[offsetrow][offsetcolumn], 1, resulttype, 0, mtype, MPI_COMM_WORLD); } MPI_Finalize(); return 0; }
int main(void) { pixel *input; pixel *scalar_input; #if USE_LUMA unsigned char *vbx_luma; #endif unsigned short *scalar_luma; pixel *vbx_output; pixel *scalar_output; vbx_timestamp_t time_start, time_stop; double scalar_time, vbx_time; int x, y; int errors = 0; vbx_test_init(); vbx_mxp_print_params(); input = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel)); scalar_input = (pixel *)vbx_remap_cached(input, IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel)); #if USE_LUMA vbx_luma = (unsigned char *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned char)); #endif scalar_luma = (unsigned short *)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned short)); vbx_output = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel)); scalar_output = (pixel *)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel)); printf("\nInitializing data\n"); printf("Resolution = %dx%d\n", IMAGE_WIDTH, IMAGE_HEIGHT); init_matrix(input, IMAGE_WIDTH, IMAGE_HEIGHT); printf("Starting Sobel 3x3 edge-detection test\n"); #if USE_LUMA scalar_rgb2luma(scalar_luma, scalar_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH); #endif vbx_timestamp_start(); time_start = vbx_timestamp(); #if !USE_LUMA scalar_rgb2luma(scalar_luma, scalar_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH); #endif scalar_sobel_argb32_3x3(scalar_output, scalar_luma, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH, RENORM_AMOUNT); time_stop = vbx_timestamp(); scalar_time = vbx_print_scalar_time(time_start, time_stop); #if USE_LUMA vbw_rgb2luma8(vbx_luma, (unsigned *)input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH); #endif vbx_timestamp_start(); time_start = vbx_timestamp(); #if USE_LUMA vbw_sobel_luma8_3x3((unsigned *)vbx_output, vbx_luma, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH, RENORM_AMOUNT); #else vbw_sobel_argb32_3x3((unsigned *)vbx_output, (unsigned *)input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH, RENORM_AMOUNT); #endif time_stop = vbx_timestamp(); vbx_time = vbx_print_vector_time(time_start, time_stop, scalar_time); for (y = 0; 
y < IMAGE_HEIGHT; y++) { for (x = 0; x < IMAGE_WIDTH; x++) { #if USE_LUMA if (scalar_luma[y*IMAGE_WIDTH+x] != vbx_luma[y*IMAGE_WIDTH+x]) { if (errors < MAX_PRINT_ERRORS) { printf("Y Error at %d, %d: Expected = %02X, got = %02X\n", y, x, scalar_luma[y*IMAGE_WIDTH+x], vbx_luma[y*IMAGE_WIDTH+x]); } errors++; } #endif if (scalar_output[y*IMAGE_WIDTH+x].r != vbx_output[y*IMAGE_WIDTH+x].r) { if (errors < MAX_PRINT_ERRORS) { printf("R Error at %d, %d: Expected = %02X, got = %02X\n", y, x, scalar_output[y*IMAGE_WIDTH+x].r, vbx_output[y*IMAGE_WIDTH+x].r); } errors++; } if (scalar_output[y*IMAGE_WIDTH+x].g != vbx_output[y*IMAGE_WIDTH+x].g) { if (errors < MAX_PRINT_ERRORS) { printf("G Error at %d, %d: Expected = %02X, got = %02X\n", y, x, scalar_output[y*IMAGE_WIDTH+x].g, vbx_output[y*IMAGE_WIDTH+x].g); } errors++; } if (scalar_output[y*IMAGE_WIDTH+x].b != vbx_output[y*IMAGE_WIDTH+x].b) { if (errors < MAX_PRINT_ERRORS) { printf("B Error at %d, %d: Expected = %02X, got = %02X\n", y, x, scalar_output[y*IMAGE_WIDTH+x].b, vbx_output[y*IMAGE_WIDTH+x].b); } errors++; } } } VBX_TEST_END(errors); return errors; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dsysv */ int main( int argc, char** argv) { TESTING_INIT(); double *h_A, *h_B, *h_X, *work, temp; real_Double_t gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0; double error, error_lapack = 0.0; magma_int_t *ipiv; magma_int_t N, n2, lda, ldb, sizeB, lwork, info; magma_int_t status = 0, ione = 1; magma_int_t ISEED[4] = {0,0,0,1}; magma_opts opts; parse_opts( argc, argv, &opts ); double tol = opts.tolerance * lapackf77_dlamch("E"); printf(" M N CPU GFlop/s (sec) GPU GFlop/s (sec) |Ax-b|/(N*|A|*|x|)\n"); printf("=========================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { N = opts.nsize[itest]; ldb = N; lda = N; n2 = lda*N; sizeB = ldb*opts.nrhs; gflops = ( FLOPS_DPOTRF( N ) + FLOPS_DPOTRS( N, opts.nrhs ) ) / 1e9; TESTING_MALLOC_CPU( ipiv, magma_int_t, N ); TESTING_MALLOC_PIN( h_A, double, n2 ); TESTING_MALLOC_PIN( h_B, double, sizeB ); TESTING_MALLOC_PIN( h_X, double, sizeB ); /* ===================================================================== Performs operation using LAPACK =================================================================== */ if ( opts.lapack ) { lwork = -1; lapackf77_dsysv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info); lwork = (int)MAGMA_D_REAL(temp); TESTING_MALLOC_CPU( work, double, lwork ); init_matrix( N, N, h_A, lda ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); cpu_time = magma_wtime(); lapackf77_dsysv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info); cpu_time = magma_wtime() - cpu_time; cpu_perf = gflops / cpu_time; if (info != 0) printf("lapackf77_dsysv returned error %d: %s.\n", (int) info, magma_strerror( info )); error_lapack = get_residual( 
opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); TESTING_FREE_CPU( work ); } /* ==================================================================== Performs operation using MAGMA =================================================================== */ init_matrix( N, N, h_A, lda ); lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B ); lapackf77_dlacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb ); magma_setdevice(0); gpu_time = magma_wtime(); magma_dsysv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info); gpu_time = magma_wtime() - gpu_time; gpu_perf = gflops / gpu_time; if (info != 0) printf("magma_dsysv returned error %d: %s.\n", (int) info, magma_strerror( info )); /* ===================================================================== Check the factorization =================================================================== */ if ( opts.lapack ) { printf("%5d %5d %7.2f (%7.2f) %7.2f (%7.2f)", (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time ); } else { printf("%5d %5d --- ( --- ) %7.2f (%7.2f)", (int) N, (int) N, gpu_perf, gpu_time ); } if ( opts.check == 0 ) { printf(" --- \n"); } else { error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb ); printf(" %8.2e %s", error, (error < tol ? "ok" : "failed")); if (opts.lapack) printf(" (lapack rel.res. = %8.2e)", error_lapack); printf("\n"); status += ! (error < tol); } TESTING_FREE_CPU( ipiv ); TESTING_FREE_PIN( h_X ); TESTING_FREE_PIN( h_B ); TESTING_FREE_PIN( h_A ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
/* //////////////////////////////////////////////////////////////////////////// -- Testing dswap, dswapblk, dpermute, dlaswp, dlaswpx */ int main( int argc, char** argv) { TESTING_INIT(); double *h_A1, *h_A2; double *d_A1, *d_A2; double *h_R1, *h_R2; // row-major and column-major performance real_Double_t row_perf0, col_perf0; real_Double_t row_perf1, col_perf1; real_Double_t row_perf2, col_perf2; real_Double_t row_perf3; real_Double_t row_perf4; real_Double_t row_perf5, col_perf5; real_Double_t row_perf6, col_perf6; real_Double_t row_perf7; real_Double_t cpu_perf; real_Double_t time, gbytes; magma_int_t N, lda, ldda, nb, j; magma_int_t ione = 1; magma_int_t *ipiv, *ipiv2; magma_int_t *d_ipiv; magma_int_t status = 0; magma_opts opts; parse_opts( argc, argv, &opts ); magma_queue_t queue = 0; printf(" cublasDswap dswap dswapblk dlaswp dpermute dlaswp2 dlaswpx dcopymatrix CPU (all in )\n"); printf(" N nb row-maj/col-maj row-maj/col-maj row-maj/col-maj row-maj row-maj row-maj row-maj/col-maj row-blk/col-blk dlaswp (GByte/s)\n"); printf("==================================================================================================================================================\n"); for( int itest = 0; itest < opts.ntest; ++itest ) { for( int iter = 0; iter < opts.niter; ++iter ) { // For an N x N matrix, swap nb rows or nb columns using various methods. // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure. // The variable 'shift' keeps track of which bit is for current test int shift = 1; int check = 0; N = opts.nsize[itest]; lda = N; ldda = ((N+31)/32)*32; nb = (opts.nb > 0 ? 
opts.nb : magma_get_dgetrf_nb( N )); nb = min( N, nb ); // each swap does 2N loads and 2N stores, for nb swaps gbytes = sizeof(double) * 4.*N*nb / 1e9; TESTING_MALLOC_PIN( h_A1, double, lda*N ); TESTING_MALLOC_PIN( h_A2, double, lda*N ); TESTING_MALLOC_PIN( h_R1, double, lda*N ); TESTING_MALLOC_PIN( h_R2, double, lda*N ); TESTING_MALLOC_CPU( ipiv, magma_int_t, nb ); TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb ); TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb ); TESTING_MALLOC_DEV( d_A1, double, ldda*N ); TESTING_MALLOC_DEV( d_A2, double, ldda*N ); for( j=0; j < nb; j++ ) { ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1.)) + 1; } /* ===================================================================== * cublasDswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasDswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { cublasDswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda); } } time = magma_sync_wtime( queue ) - time; col_perf0 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); 
} } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * dswap, row-by-row (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_dswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1); } } time = magma_sync_wtime( queue ) - time; row_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { magmablas_dswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda ); } } time = magma_sync_wtime( queue ) - time; col_perf1 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * dswapblk, blocked version (2 matrices) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 
100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_dswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; row_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* Column Major */ init_matrix( N, N, h_A1, lda, 0 ); init_matrix( N, N, h_A2, lda, 100 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda ); time = magma_sync_wtime( queue ); magmablas_dswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0); time = magma_sync_wtime( queue ) - time; col_perf2 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda ); check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) || diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift; shift *= 2; /* ===================================================================== * dpermute_long (1 matrix) */ /* Row Major */ memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) ); // dpermute updates ipiv2 init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 ); time = magma_sync_wtime( queue ) - time; row_perf3 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 
2; /* ===================================================================== * LAPACK-style dlaswp (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dlaswp( N, d_A1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf4 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style dlaswp (1 matrix) - d_ipiv on GPU */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 ); magmablas_dlaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 ); time = magma_sync_wtime( queue ) - time; row_perf7 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * LAPACK-style dlaswpx (extended for row- and col-major) (1 matrix) */ /* Row Major */ init_matrix( N, N, h_A1, lda, 0 ); magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; row_perf5 = gbytes / time; for( j=0; j < nb; j++) { if ( j != (ipiv[j]-1)) { blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione); } } magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* Col Major */ init_matrix( N, N, h_A1, lda, 0 ); 
magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda ); time = magma_sync_wtime( queue ); magmablas_dlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1); time = magma_sync_wtime( queue ) - time; col_perf5 = gbytes / time; time = magma_wtime(); lapackf77_dlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione); time = magma_wtime() - time; cpu_perf = gbytes / time; magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda ); check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift; shift *= 2; /* ===================================================================== * Copy matrix. */ time = magma_sync_wtime( queue ); magma_dcopymatrix( N, nb, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap col_perf6 = 0.5 * gbytes / time; time = magma_sync_wtime( queue ); magma_dcopymatrix( nb, N, d_A1, ldda, d_A2, ldda ); time = magma_sync_wtime( queue ) - time; // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap row_perf6 = 0.5 * gbytes / time; printf("%5d %3d %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c/ %6.2f%c %6.2f%c %6.2f%c %6.2f%c %6.2f%c/ %6.2f%c %6.2f / %6.2f %6.2f %10s\n", (int) N, (int) nb, row_perf0, ((check & 0x001) != 0 ? '*' : ' '), col_perf0, ((check & 0x002) != 0 ? '*' : ' '), row_perf1, ((check & 0x004) != 0 ? '*' : ' '), col_perf1, ((check & 0x008) != 0 ? '*' : ' '), row_perf2, ((check & 0x010) != 0 ? '*' : ' '), col_perf2, ((check & 0x020) != 0 ? '*' : ' '), row_perf3, ((check & 0x040) != 0 ? '*' : ' '), row_perf4, ((check & 0x080) != 0 ? '*' : ' '), row_perf7, ((check & 0x100) != 0 ? '*' : ' '), row_perf5, ((check & 0x200) != 0 ? '*' : ' '), col_perf5, ((check & 0x400) != 0 ? '*' : ' '), row_perf6, col_perf6, cpu_perf, (check == 0 ? "ok" : "* failed") ); status += ! 
(check == 0); TESTING_FREE_PIN( h_A1 ); TESTING_FREE_PIN( h_A2 ); TESTING_FREE_PIN( h_R1 ); TESTING_FREE_PIN( h_R2 ); TESTING_FREE_CPU( ipiv ); TESTING_FREE_CPU( ipiv2 ); TESTING_FREE_DEV( d_ipiv ); TESTING_FREE_DEV( d_A1 ); TESTING_FREE_DEV( d_A2 ); fflush( stdout ); } if ( opts.niter > 1 ) { printf( "\n" ); } } TESTING_FINALIZE(); return status; }
ss_controller::ss_controller(int inputs, int outputs, int states, controllers controller) : num_inputs(inputs), num_outputs(outputs), num_states(states) { //initalizes all the matrices A = init_matrix(num_states, num_states); B = init_matrix(num_states, num_outputs); C = init_matrix(num_outputs, num_states); D = init_matrix(num_outputs, num_outputs); L = init_matrix(num_states, num_outputs); K = init_matrix(num_outputs, num_states); X = init_matrix(num_states, 1); X_hat = init_matrix(num_states, 1); U = init_matrix(num_outputs, 1); U_max = init_matrix(num_outputs, 1); U_min = init_matrix(num_outputs, 1); U_tmp = init_matrix(num_states, 1); b_u = init_matrix(num_states, 1); l_y = init_matrix(num_states, 1); l_c = init_matrix(num_states, num_states); a_lc = init_matrix(num_states, num_states); alc_xhat = init_matrix(num_states, 1); xhatp1 = init_matrix(num_states, 1); //import the matlab-computed matrix values switch (controller) { case SHOOTER: #include "shootercontroller.h" break; case DRIVE: #include "drivecontroller.h" break; default: break; } }
/*
 * Solves the linear system A*x = b with the Jacobi iteration, prints the
 * solution for small systems, checks that every component is close to 1.0,
 * and reports the iteration count and elapsed wall-clock time.
 *
 * A and b are set up by init_matrix(); X and X_old hold the current and the
 * previous iterate. Iteration stops once the Euclidean distance between two
 * consecutive iterates is no larger than epsilon.
 *
 * Returns 0 on success, 1 if the iterate vectors cannot be allocated.
 */
int main(int argc, char **argv)
{
	unsigned int i, j;
	unsigned int iterations = 0;
	double error, norm, max = 0.0;
	double sum;          /* off-diagonal part of row i times the previous iterate */
	double diff;         /* component-wise change between two iterates */
	double dist_squared; /* squared distance between two iterates */
	/* convergence threshold, scaled with the system size */
	double epsilon = sqrt(0.00000001*MATRIX_SIZE);
	struct timeval start, end;

	(void)argc;
	(void)argv;

	printf("\nInitialize system of linear equations...\n");
	/* allocate memory for the system of linear equations */
	init_matrix(&A, &b, MATRIX_SIZE);
	X = (double *)malloc(sizeof(double) * MATRIX_SIZE);
	X_old = (double *)malloc(sizeof(double) * MATRIX_SIZE);
	if (X == NULL || X_old == NULL) {
		fprintf(stderr, "Cannot allocate the solution vectors\n");
		free(X_old);
		free(X);
		clean_matrix(&A);
		clean_vector(&b);
		return 1;
	}

	/* a "random" solution vector */
	for (i = 0; i < MATRIX_SIZE; i++) {
		X[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;
		X_old[i] = 0.0;
	}

	printf("Start Jacobi method...\n");

	gettimeofday(&start, NULL);

	norm = 1.0;
	/* iterate until consecutive iterates are very close to each other */
	while (norm > epsilon) {
		/*
		 * Freeze the previous iterate first: a Jacobi step must compute
		 * every new component from the same old vector, so X_old may not
		 * change while the sweep below is reading it.
		 */
		for (i = 0; i < MATRIX_SIZE; i++)
			X_old[i] = X[i];

		/* one sweep over all unknowns */
		for (i = 0; i < MATRIX_SIZE; i++) {
			sum = 0.0;
			for (j = 0; j < MATRIX_SIZE; j++) {
				/*
				 * Skip the diagonal with a test, not by advancing j:
				 * advancing j on the last row would index one element
				 * past the end of A[i] and X_old.
				 */
				if (j != i)
					sum += A[i][j] * X_old[j];
			}
			X[i] = (b[i] - sum) / A[i][i];
		}

		/* Euclidean distance between the new and the previous iterate */
		dist_squared = 0.0;
		for (i = 0; i < MATRIX_SIZE; i++) {
			diff = X_old[i] - X[i];
			dist_squared += diff * diff;
		}
		norm = sqrt(dist_squared);

		iterations++;
	}

	gettimeofday(&end, NULL);

	if (MATRIX_SIZE < 16) {
		printf("Print the solution...\n");
		/* print solution */
		for (i = 0; i < MATRIX_SIZE; i++) {
			for (j = 0; j < MATRIX_SIZE; j++)
				printf("%8.2f\t", A[i][j]);
			printf("*\t%8.2f\t=\t%8.2f\n", X[i], b[i]);
		}
	}

	printf("Check the result...\n");
	/*
	 * check the result
	 * X[i] have to be 1
	 */
	for (i = 0; i < MATRIX_SIZE; i++) {
		error = fabs(X[i] - 1.0f);

		if (max < error)
			max = error;
		if (error > 0.01f)
			printf("Result is on position %u wrong (%f != 1.0)\n",
			       i, X[i]);
	}
	printf("maximal error is %f\n", max);

	printf("\nmatrix size: %d x %d\n", MATRIX_SIZE, MATRIX_SIZE);
	printf("number of iterations: %u\n", iterations);
	printf("Time : %lf sec\n",
	       (double)(end.tv_sec - start.tv_sec) +
	       (double)(end.tv_usec - start.tv_usec) / 1000000.0);

	/* frees the allocated memory */
	free(X_old);
	free(X);
	clean_matrix(&A);
	clean_vector(&b);

	return 0;
}