예제 #1
0
파일: blas2.cpp 프로젝트: GnsP/viennacl-dev
/**
 * C-API entry point that runs viennacl::linalg::inplace_solve() on the triangular matrix A and
 * the vector x.
 *
 * The memory handles for x and A are set up through init_vector()/init_matrix(), wrapped in
 * vector_base/matrix_base views of the element type selected by x->precision, and the solver
 * is invoked with:
 *   - the transposed matrix view when A->trans == ViennaCLTrans,
 *   - upper_tag when uplo == ViennaCLUpper, lower_tag otherwise.
 *
 * @param A     Matrix descriptor (sizes, starts, strides, internal sizes, order, trans flag).
 * @param x     Vector descriptor (size, offset, increment, precision); passed to the solver as
 *              its in-place operand.
 * @param uplo  Selects the upper or lower triangular solver tag.
 * @return ViennaCLSuccess after the solver call; ViennaCLGenericFailure if either handle could
 *         not be initialised or x->precision is neither ViennaCLFloat nor ViennaCLDouble.
 */
VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsv(ViennaCLMatrix A, ViennaCLVector x, ViennaCLUplo uplo)
{
  viennacl::backend::mem_handle rhs_mem;
  viennacl::backend::mem_handle mat_mem;

  // Short-circuit keeps the original order: the matrix handle is only touched
  // once the vector handle has been set up successfully.
  if (init_vector(rhs_mem, x) != ViennaCLSuccess || init_matrix(mat_mem, A) != ViennaCLSuccess)
    return ViennaCLGenericFailure;

  const bool solve_transposed = (A->trans == ViennaCLTrans);
  const bool use_upper        = (uplo == ViennaCLUpper);
  const bool is_row_major     = (A->order == ViennaCLRowMajor);

  if (x->precision == ViennaCLFloat)
  {
    viennacl::vector_base<float> rhs(rhs_mem, x->size, x->offset, x->inc);
    viennacl::matrix_base<float> tri(mat_mem,
                                     A->size1, A->start1, A->stride1, A->internal_size1,
                                     A->size2, A->start2, A->stride2, A->internal_size2, is_row_major);

    if (solve_transposed && use_upper)
      viennacl::linalg::inplace_solve(viennacl::trans(tri), rhs, viennacl::linalg::upper_tag());
    else if (solve_transposed)
      viennacl::linalg::inplace_solve(viennacl::trans(tri), rhs, viennacl::linalg::lower_tag());
    else if (use_upper)
      viennacl::linalg::inplace_solve(tri, rhs, viennacl::linalg::upper_tag());
    else
      viennacl::linalg::inplace_solve(tri, rhs, viennacl::linalg::lower_tag());

    return ViennaCLSuccess;
  }

  if (x->precision == ViennaCLDouble)
  {
    viennacl::vector_base<double> rhs(rhs_mem, x->size, x->offset, x->inc);
    viennacl::matrix_base<double> tri(mat_mem,
                                      A->size1, A->start1, A->stride1, A->internal_size1,
                                      A->size2, A->start2, A->stride2, A->internal_size2, is_row_major);

    if (solve_transposed && use_upper)
      viennacl::linalg::inplace_solve(viennacl::trans(tri), rhs, viennacl::linalg::upper_tag());
    else if (solve_transposed)
      viennacl::linalg::inplace_solve(viennacl::trans(tri), rhs, viennacl::linalg::lower_tag());
    else if (use_upper)
      viennacl::linalg::inplace_solve(tri, rhs, viennacl::linalg::upper_tag());
    else
      viennacl::linalg::inplace_solve(tri, rhs, viennacl::linalg::lower_tag());

    return ViennaCLSuccess;
  }

  // Any other precision value is not handled by this wrapper.
  return ViennaCLGenericFailure;
}
예제 #2
0
/* MPI driver of the quadratic sieve.
 *
 * Assuming slaves (workers) are all homogeneous, every rank does the calculations regarding
 * primes sieving, calculating the smoothness base and the modular roots itself. All ranks then
 * sieve disjoint intervals; rank 0 (the master) gathers the smooth numbers, runs the Gauss
 * elimination and derives the factors.
 *
 * argv[1] is the decimal number to factorize.
 *
 * Returns 0 on normal completion (also when N is a perfect square); exits with status 2 on a
 * missing or unparsable argument and with status 0 when N is detected as (probably) prime.
 * MPI_Finalize() is called on every one of these paths.
 */
int main(int argc, char **argv) {
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &mpi_group_size);
	int len;
	MPI_Get_processor_name(processor_name, &len);

	gettimeofday(&start_global, NULL);
	print_lib_version();

	mpz_init(N);
	mpz_t B;
	mpz_init(B);

	unsigned long int uBase;
	int64_t nb_primes;
	modular_root_t *modular_roots;

	uint64_t i, j;

	if (argc < 2) {
		PRINT(my_rank, "usage: %s Number_to_factorize\n", argv[0]);
		MPI_Finalize(); /* every rank sees the same argv, so all of them finalize here */
		exit(2);
	}

	/* N is already initialised above: assign with mpz_set_str instead of initialising it a
	 * second time (which would leak the first allocation) */
	if (mpz_set_str(N, argv[1], 10) == -1) {
		PRINT(my_rank, "Cannot load N %s\n", argv[1]);
		MPI_Finalize();
		exit(2);
	}

	mpz_t sqrtN, rem;
	mpz_init(sqrtN);
	mpz_init(rem);
	mpz_sqrtrem(sqrtN, rem, N);

	if (mpz_cmp_ui(rem, 0) != 0) /* if not perfect square, calculate the ceiling */
		mpz_add_ui(sqrtN, sqrtN, 1);
	else /* N is a perfect square, factored! */
	{
		PRINT(my_rank, "\n<<<[FACTOR]>>> %s\n", mpz_get_str(NULL, 10, sqrtN));
		MPI_Finalize();
		return 0;
	}

	if (mpz_probab_prime_p(N, 10) > 0) /* don't bother factoring */
	{
		PRINT(my_rank, "N:%s is prime\n", mpz_get_str(NULL, 10, N));
		MPI_Finalize();
		exit(0);
	}

	OPEN_LOG_FILE("freq");

//--------------------------------------------------------
//  calculate the smoothness base for the given N
//--------------------------------------------------------
	get_smoothness_base(B, N); /* if N is too small, the program will surely fail, please consider a pen and paper instead */
	uBase = mpz_get_ui(B);
	PRINT(my_rank, "n: %s\tBase: %s\n",
			mpz_get_str(NULL, 10, N), mpz_get_str(NULL, 10, B));

//--------------------------------------------------------
// sieve primes that are less than the smoothness base using Eratosthenes sieve
//--------------------------------------------------------
	START_TIMER();
	nb_primes = sieve_primes_up_to((int64_t) (uBase));

	PRINT(my_rank, "\tPrimes found %" PRId64 " [Smoothness Base %lu]\n",
			nb_primes, uBase);
	STOP_TIMER_PRINT_TIME("\tEratosthenes Sieving done");

//--------------------------------------------------------
// fill the primes array with primes to which n is a quadratic residue
//--------------------------------------------------------
	START_TIMER();
	primes = calloc(nb_primes, sizeof(int64_t));
	nb_qr_primes = fill_primes_with_quadratic_residue(primes, N);

	/*for(i=0; i<nb_qr_primes; i++)
	 PRINT(my_rank, "%" PRId64 "\n", primes[i]);*/

	PRINT(my_rank, "\tN-Quadratic primes found %" PRId64 "\n", nb_qr_primes);
	STOP_TIMER_PRINT_TIME("\tQuadratic prime filtering done");

//--------------------------------------------------------
// calculate modular roots
//--------------------------------------------------------
	START_TIMER();
	modular_roots = calloc(nb_qr_primes, sizeof(modular_root_t));
	mpz_t tmp, r1, r2;
	mpz_init(tmp);
	mpz_init(r1);
	mpz_init(r2);

	for (i = 0; i < nb_qr_primes; i++) {
		mpz_set_ui(tmp, (unsigned long) primes[i]);
		mpz_sqrtm(r1, N, tmp); /* calculate the modular root */
		mpz_neg(r2, r1); /* second root: -r1 mod p */
		mpz_mod(r2, r2, tmp);

		modular_roots[i].root1 = mpz_get_ui(r1);
		modular_roots[i].root2 = mpz_get_ui(r2);
	}
	mpz_clear(tmp);
	mpz_clear(r1);
	mpz_clear(r2);
	STOP_TIMER_PRINT_TIME("Modular roots calculation done");

//--------------------------------------------------------
//         ***** initialize the matrix *****
//--------------------------------------------------------
	if (my_rank == 0) /* only the master has the matrix */
	{
		START_TIMER();
		init_matrix(&matrix, nb_qr_primes + NB_VECTORS_OFFSET, nb_qr_primes);
		mpz_init2(tmp_matrix_row, nb_qr_primes);
		STOP_TIMER_PRINT_TIME("Matrix initialized");
	}

//--------------------------------------------------------
// [Sieving] - everyone sieves including the master
//--------------------------------------------------------
	START_TIMER();

	mpz_t x, sieving_index, next_sieving_index, relative_start, global_step;
	unsigned long ui_index, SIEVING_STEP = 50000; /* we sieve for 50000 elements at each loop */
	int LOCAL_SIEVING_ROUNDS = 10; /* number of iterations a worker sieves before communicating results to the master */
	unsigned long sieving_round = 0;
	unsigned long nb_big_rounds = 0;

	uint64_t p_pow;
	smooth_number_t *x_squared;

	x_squared = calloc(SIEVING_STEP, sizeof(smooth_number_t));

	if (my_rank == 0)
		smooth_numbers = calloc(nb_qr_primes + NB_VECTORS_OFFSET,
				sizeof(smooth_number_t));
	else
		temp_slaves_smooth_numbers = calloc(500, sizeof(smooth_number_t));
	/* TODO: this is not properly correct, using a linked list is better to keep track of temporary
	 * smooth numbers at the slave nodes; however it's pretty rare to find 500 smooth numbers in
	 * a 50000 * 10 interval. */

	mpz_init_set(x, sqrtN);
	mpz_init(global_step);
	mpz_init(relative_start);
	mpz_init(sieving_index);
	mpz_init(next_sieving_index);

	mpz_t p;
	mpz_init(p);
	mpz_t str;
	mpz_init_set(str, sieving_index);
	PRINT(my_rank, "\n[%s] Sieving ...\n", processor_name);

//--------------------------------------------------------
// Init before sieving
//--------------------------------------------------------
	for (i = 0; i < SIEVING_STEP; i++) {
		mpz_init(x_squared[i].value_x);
		mpz_init(x_squared[i].value_x_squared);

		mpz_init2(x_squared[i].factors_vect, nb_qr_primes);
		mpz_add_ui(x, x, 1);
	}

	int nb_smooth_per_round = 0;
	char s[512];

//--------------------------------------------------------
// WHILE smooth numbers found less than the primes in the smooth base + NB_VECTORS_OFFSET for master
// Or master asked for more smooth numbers from slaves
//--------------------------------------------------------
	while (1) {
		mpz_set_ui(global_step, nb_big_rounds); /* calculates the coordinate where the workers start sieving from */
		mpz_mul_ui(global_step, global_step, (unsigned long) mpi_group_size);
		mpz_mul_ui(global_step, global_step, SIEVING_STEP);
		mpz_mul_ui(global_step, global_step, LOCAL_SIEVING_ROUNDS);
		mpz_add(global_step, global_step, sqrtN);

		/* this rank's slice inside the current big round */
		mpz_set_ui(relative_start, SIEVING_STEP);
		mpz_mul_ui(relative_start, relative_start, LOCAL_SIEVING_ROUNDS);
		mpz_mul_ui(relative_start, relative_start, (unsigned long) my_rank);
		mpz_add(relative_start, relative_start, global_step);

		mpz_set(sieving_index, relative_start);
		mpz_set(next_sieving_index, relative_start);

		for (sieving_round = 0; sieving_round < LOCAL_SIEVING_ROUNDS; /* each slave sieves for LOCAL_SIEVING_ROUNDS rounds */
		sieving_round++) {
			nb_smooth_per_round = 0;
			mpz_set(x, next_sieving_index); /* sieve numbers from sieving_index to sieving_index + sieving_step */
			mpz_set(sieving_index, next_sieving_index);

			if (my_rank == 0) {
				/* mpz_get_str(NULL, ...) allocates a fresh string on every call; release it
				 * after printing so the progress line does not leak once per round */
				char *index_str = mpz_get_str(NULL, 10, sieving_index);
				printf("\r");
				printf(
						"\t\tSieving at: %s30 <--> Smooth numbers found: %" PRId64 "/%" PRId64 "",
						index_str,
						nb_global_smooth_numbers_found, nb_qr_primes);
				fflush(stdout);
				free(index_str); /* assumes GMP's default allocator (malloc/free) is in use */
			}

			for (i = 0; i < SIEVING_STEP; i++) {
				mpz_set(x_squared[i].value_x, x);

				mpz_pow_ui(x_squared[i].value_x_squared, x, 2); /* calculate value_x_squared <- x²-n */
				mpz_sub(x_squared[i].value_x_squared,
						x_squared[i].value_x_squared, N);

				mpz_clear(x_squared[i].factors_vect);
				mpz_init2(x_squared[i].factors_vect, nb_qr_primes); /* reconstruct a new fresh 0ed vector of size nb_qr_primes bits */

				mpz_add_ui(x, x, 1);
			}
			mpz_set(next_sieving_index, x);

//--------------------------------------------------------
// eliminate factors in the x_squared array, those who are 'destructed' to 1 are smooth
//--------------------------------------------------------
			for (i = 0; i < nb_qr_primes; i++) {
				mpz_set_ui(p, (unsigned long) primes[i]);
				mpz_set(x, sieving_index);

				/* get the first multiple of p that is directly larger than sieving_index
				 * Quadratic SIEVING: all elements from this number and in positions multiples of root1 and root2
				 * are also multiples of p */
				get_sieving_start_index(x, x, p, modular_roots[i].root1);
				mpz_set(str, x);
				mpz_sub(x, x, sieving_index); /* x contains index of first number that is divisible by p */

				for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) {
					p_pow = mpz_remove(x_squared[j].value_x_squared,
							x_squared[j].value_x_squared, p); /* eliminate all factors of p */

					if (p_pow & 1) /* mark bit if odd power of p exists in this x_squared[j] */
					{
						mpz_setbit(x_squared[j].factors_vect, i);
					}

					if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) {
						save_smooth_number(x_squared[j]);
						nb_smooth_per_round++;
					}
					/* sieve next element located p steps from here */
				}

				/* same goes for root2 */
				if (modular_roots[i].root2 == modular_roots[i].root1)
					continue;

				mpz_set(x, sieving_index);

				get_sieving_start_index(x, x, p, modular_roots[i].root2);
				mpz_set(str, x);
				mpz_sub(x, x, sieving_index);

				for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) {
					p_pow = mpz_remove(x_squared[j].value_x_squared,
							x_squared[j].value_x_squared, p);

					if (p_pow & 1) {
						mpz_setbit(x_squared[j].factors_vect, i);
					}

					if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) {
						save_smooth_number(x_squared[j]);
						nb_smooth_per_round++;
					}
				}
			}
		}

		if (my_rank == 0) /* master gathers smooth numbers from slaves */
		{
			gather_smooth_numbers();
			notify_slaves();
		} else /* slaves send their smooth numbers to master */
		{
			send_smooth_numbers_to_master();
			nb_global_smooth_numbers_found = get_server_notification();
		}

		if (nb_global_smooth_numbers_found >= nb_qr_primes + NB_VECTORS_OFFSET)
			break;

		nb_big_rounds++;
	}

	STOP_TIMER_PRINT_TIME("\nSieving DONE");

	if (my_rank == 0) {
		uint64_t t = 0;

//--------------------------------------------------------
//the matrix ready, start Gauss elimination. The Matrix is filled on the call of save_smooth_number()
//--------------------------------------------------------
		START_TIMER();
		gauss_elimination(&matrix);
		STOP_TIMER_PRINT_TIME("\nGauss elimination done");

		uint64_t row_index = nb_qr_primes + NB_VECTORS_OFFSET - 1; /* last row in the matrix */
		int nb_linear_relations = 0;
		mpz_t linear_relation_z, solution_z;
		mpz_init(linear_relation_z);
		mpz_init(solution_z);

		/* count the all-zero rows at the bottom of the Gauss eliminated matrix */
		get_matrix_row(linear_relation_z, &matrix, row_index--);
		while (mpz_cmp_ui(linear_relation_z, 0) == 0) {
			nb_linear_relations++;
			get_matrix_row(linear_relation_z, &matrix, row_index--);
		}

		PRINT(my_rank, "\tLinear dependent relations found : %d\n",
				nb_linear_relations);

//--------------------------------------------------------
// Factor
//--------------------------------------------------------
		//We use the last linear relation to reconstruct our solution
		START_TIMER();
		PRINT(my_rank, "%s", "\nFactorizing..\n");
		mpz_t solution_X, solution_Y;
		mpz_init(solution_X);
		mpz_init(solution_Y);

		/* we start testing from the first linear relation encountered in the matrix */
		for (j = nb_linear_relations; j > 0; j--) {
			/* j is uint64_t, so the whole expression is 64-bit unsigned: cast to match %d */
			PRINT(my_rank, "Trying %d..\n", (int) (nb_linear_relations - j + 1));
			mpz_set_ui(solution_X, 1);
			mpz_set_ui(solution_Y, 1);

			/* NOTE(review): for j == 1 this index equals the row count; confirm that
			 * get_identity_row() expects this numbering */
			get_identity_row(solution_z, &matrix,
					nb_qr_primes + NB_VECTORS_OFFSET - j + 1);

			for (i = 0; i < nb_qr_primes; i++) {
				if (mpz_tstbit(solution_z, i)) {
					mpz_mul(solution_X, solution_X, smooth_numbers[i].value_x);
					mpz_mod(solution_X, solution_X, N); /* reduce x to modulo N */

					mpz_mul(solution_Y, solution_Y,
							smooth_numbers[i].value_x_squared);
					/*TODO: handling huge stuff here, there is no modulo N like in the solution_X case!
					 * eliminate squares as long as you go*/
				}
			}

			mpz_sqrt(solution_Y, solution_Y);
			mpz_mod(solution_Y, solution_Y, N); /* y = sqrt(MUL(xi²-n)) mod N */

			mpz_sub(solution_X, solution_X, solution_Y);

			mpz_gcd(solution_X, solution_X, N);

			if (mpz_cmp(solution_X, N) != 0 && mpz_cmp_ui(solution_X, 1) != 0) /* factor can be 1 or N, try another relation */
				break;
		}

		/* when no linear relation was available the loop never ran and solution_X is still 0:
		 * fall back to the trivial factor 1 instead of dividing by zero below */
		if (mpz_sgn(solution_X) == 0)
			mpz_set_ui(solution_X, 1);
		mpz_cdiv_q(solution_Y, N, solution_X);

		PRINT(my_rank, "\n>>>>>>>>>>> FACTORED %s =\n",
				mpz_get_str(NULL, 10, N));
		PRINT(
				my_rank,
				"\tFactor 1: %s \n\tFactor 2: %s",
				mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y));

		/* s is a fixed 512-byte buffer while the decimal strings are unbounded: bound the writes */
		snprintf(s, sizeof(s), "\n>>>>>>>>>>> FACTORED %s =\n",
				mpz_get_str(NULL, 10, N));
		APPEND_TO_LOG_FILE(s);
		snprintf(s, sizeof(s), "\tFactor 1: %s \n\tFactor 2: %s",
				mpz_get_str(NULL, 10, solution_X),
				mpz_get_str(NULL, 10, solution_Y));
		APPEND_TO_LOG_FILE(s);

		gettimeofday(&end_global, NULL);
		timersub(&end_global, &start_global, &elapsed);
		snprintf(s, sizeof(s), "****** TOTAL TIME: %.3f ms\n",
				elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000);
		APPEND_TO_LOG_FILE(s);

		STOP_TIMER_PRINT_TIME("\nFactorizing done");

		/* release the master-only big integers */
		mpz_clear(linear_relation_z);
		mpz_clear(solution_z);
		mpz_clear(solution_X);
		mpz_clear(solution_Y);
	}

	PRINT(my_rank, "%s", "\nCleaning memory..\n");

	/********************** clear the x_squared array **********************/
	for (i = 0; i < SIEVING_STEP; i++) {
		mpz_clear(x_squared[i].value_x);
		mpz_clear(x_squared[i].value_x_squared);
		//free(x_squared[i].factors_exp);
		mpz_clear(x_squared[i].factors_vect);
	}
	free(x_squared);
	/********************** clear the x_squared array **********************/

	free(modular_roots);
	/********************** clear the smooth_numbers array **********************/
	if (my_rank == 0) {
		for (i = 0; i < nb_qr_primes + NB_VECTORS_OFFSET; i++) {
			mpz_clear(smooth_numbers[i].value_x);
			mpz_clear(smooth_numbers[i].value_x_squared);
			mpz_clear(smooth_numbers[i].factors_vect);
			//free(smooth_numbers[i].factors_exp);
		}
		free(smooth_numbers);
	} else {
		for (i = 0; i < 500; i++) {
			mpz_clear(temp_slaves_smooth_numbers[i].value_x);
			mpz_clear(temp_slaves_smooth_numbers[i].value_x_squared);
			mpz_clear(temp_slaves_smooth_numbers[i].factors_vect);
		}
		free(temp_slaves_smooth_numbers);
	}
	/********************** clear the smooth_numbers array **********************/

	free(primes);
	/********************** clear mpz _t **********************/
	mpz_clear(B);
	mpz_clear(N);
	mpz_clear(sqrtN); /* previously the no-op expression statement "sqrtN, rem;" */
	mpz_clear(rem);
	mpz_clear(x);
	mpz_clear(global_step);
	mpz_clear(relative_start);
	mpz_clear(sieving_index);
	mpz_clear(next_sieving_index);
	mpz_clear(p);
	mpz_clear(str);
	/********************** clear mpz _t **********************/

	free_matrix(&matrix);

	gettimeofday(&end_global, NULL);
	timersub(&end_global, &start_global, &elapsed);
	PRINT(my_rank, "****** TOTAL TIME: %.3f ms\n",
			elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000);
	show_mem_usage();

	MPI_Finalize();

	return 0;
}
예제 #3
0
파일: main.c 프로젝트: explicite/oww
/*
 * Self-test driver for the dense / CRS / CCS matrix helpers.
 *
 * Part 1 builds a dense matrix, copies it, compresses the copies to CRS and CCS, decompresses
 * them again and compares every result; it then multiplies both compressed forms with the
 * same vector and compares the products.
 * Part 2 compares the sequential CRS product against the OpenMP and pthread variants.
 *
 * Each comparison is reported through test(). Always returns 0.
 */
int main()
{
  const int big_dim = 1000;   /* dimension used by the copy/compress/multiply checks */
  const int small_dim = 100;  /* dimension used by the parallel product checks */

  /* Disabled: optional output file.
  FILE *f;
  char fn[] = "data.txt";
  f = fopen(fn, "w");

  if(f == NULL)
  {
    printf("Error: FILE!\n");
    exit(1);
  }*/

  /* ---- Part 1: copy, compress, decompress, multiply ---- */
  Matrix* dense_src = init_matrix(big_dim, big_dim);
  f2(dense_src, 1, 1, 2);
  Matrix* dense_dup = copy_matrix(dense_src);
  test(assert_matrix(dense_src, dense_dup), "Copy matrix");

  CRS* row_packed = cp_crs(dense_src);
  CCS* col_packed = cp_ccs(dense_dup);

  Matrix* from_rows = uncp_crs(row_packed);
  Matrix* from_cols = uncp_ccs(col_packed);

  test(assert_matrix(from_rows, dense_src), "Decompress crs");
  test(assert_matrix(from_cols, dense_dup), "Decompress ccs");
  test(assert_matrix(from_rows, from_cols), "Decompress matrix");

  Vector* big_operand = gen_vector(big_dim, 0.1, 1);
  Vector* product_by_rows = mtp_crs(row_packed, big_operand);
  Vector* product_by_cols = mtp_ccs(col_packed, big_operand);

  test(assert_vector(product_by_rows, product_by_cols), "CCS and CRS product");

  /* release everything allocated in part 1 */
  free_matrix(dense_src);
  free_matrix(dense_dup);
  free_ccs(col_packed);
  free_crs(row_packed);
  free_vector(big_operand);
  free_vector(product_by_rows);
  free_vector(product_by_cols);
  free_matrix(from_rows);
  free_matrix(from_cols);

  /* ---- Part 2: parallel CRS products against the sequential one ---- */
  Matrix* dense_small = init_matrix(small_dim, small_dim);
  f2(dense_small, 1, 1, 2);

  CRS* reference_crs = cp_crs(dense_small);
  Vector* small_operand = gen_vector(small_dim, 0.1, 1);
  free_matrix(dense_small);  /* the dense form is not used again below */

  Vector* sequential_result = mtp_crs(reference_crs, small_operand);

  Vector* omp_result = openmp_mtp_crs(reference_crs, small_operand);
  test(assert_vector(sequential_result, omp_result), "CRS openmp product validation");
  free_vector(omp_result);

  Vector* threads_result = pthread_mtp_crs(reference_crs, small_operand);
  test(assert_vector(sequential_result, threads_result), "CRS pthread product validation");
  free_vector(threads_result);

  /* release everything allocated in part 2 (dense_small was already freed) */
  free_crs(reference_crs);
  free_vector(small_operand);
  free_vector(sequential_result);

  /* Disabled: timing comparison of the product variants.
  //_________________________________________________________________________________

  //TEST - SPEED
  Matrix* mtx_speed = init_matrix(10,10);
  f2(mtx_speed,1,1,2);

  CRS* crs_speed = cp_crs(mtx_speed);
  CCS* ccs_speed = cp_ccs(mtx_speed);

  Vector* vector_speed = gen_vector(10, 0.1, 1);

  printf("\nStandard ccs product\n");
  init_stoper();
  Vector* mtp_ccs_product = mtp_ccs(ccs_speed, vector_speed);
  print_stoper();

  printf("\nStandard crs product\n");
  init_stoper();
  Vector* mtp_crs_product = mtp_crs(crs_speed, vector_speed);
  print_stoper();

  //TEST - PRODUCT VALIDATION
  test(assert_vector(mtp_ccs_product, mtp_crs_product), "ccs and crs product");

  //TEST - CLEAN
  free_crs(crs_speed);
  free_ccs(ccs_speed);

  //TEST - OPENMP PRODUCT
  CRS* openmp_crs_speed = cp_crs(mtx_speed);
  printf("\nopenmp crs product\n");
  init_stoper();
  Vector* openmp_mtp_crs_product = openmp_mtp_crs(openmp_crs_speed, vector_speed);
  print_stoper();

  //TEST - OPENMP PRODUCT VALIDATION
  test(assert_vector(openmp_mtp_crs_product, mtp_crs_product), "openmp product validation");

  //TEST - OPENMP CLEAN
  free_crs(openmp_crs_speed);
  free_vector(openmp_mtp_crs_product);

  //TEST - PTHREAD PRODUCT
  CRS* pthread_crs_speed = cp_crs(mtx_speed);
  printf("\npthread crs product\n");
  init_stoper();
  Vector* pthread_mtp_crs_product = pthread_mtp_crs(pthread_crs_speed, vector_speed);
  print_stoper();

  //TEST - PTHREAD PRODUCT VALIDATION
  test(assert_vector(pthread_mtp_crs_product, mtp_crs_product), "pthread product validation");

  //TEST PTHREAD CLEAN
  free_crs(pthread_crs_speed);
  free_vector(pthread_mtp_crs_product);

  //TEST - MPI PRODUCT
  CRS* mpi_crs_speed = copy_crs(crs_speed);
  printf("\nmpi crs product\n");
  init_stoper();
  Vector* mpi_mtp_crs_product = mpi_mtp_crs(mpi_crs_speed, vector_speed);
  print_stoper();

  //TEST - MPI PRODUCT VALIDATION
  test(assert_vector(mpi_mtp_crs_product, mtp_crs_product), "mpi product validation");

  //TEST - MPI CLEAN
  free_crs(mpi_crs_speed);
  free_vector(mpi_mtp_crs_product);
  */
  return 0;
}
예제 #4
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing sswap, sswapblk, slaswp, slaswpx
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    float *h_A1, *h_A2;
    float *h_R1, *h_R2;
    magmaFloat_ptr d_A1, d_A2;
    
    // row-major and column-major performance
    real_Double_t row_perf0 = MAGMA_D_NAN, col_perf0 = MAGMA_D_NAN;
    real_Double_t row_perf1 = MAGMA_D_NAN, col_perf1 = MAGMA_D_NAN;
    real_Double_t row_perf2 = MAGMA_D_NAN, col_perf2 = MAGMA_D_NAN;
    real_Double_t row_perf4 = MAGMA_D_NAN;
    real_Double_t row_perf5 = MAGMA_D_NAN, col_perf5 = MAGMA_D_NAN;
    real_Double_t row_perf6 = MAGMA_D_NAN, col_perf6 = MAGMA_D_NAN;
    real_Double_t row_perf7 = MAGMA_D_NAN;
    real_Double_t cpu_perf  = MAGMA_D_NAN;

    real_Double_t time, gbytes;

    magma_int_t N, lda, ldda, nb, j;
    magma_int_t ione = 1;
    magma_int_t *ipiv, *ipiv2;
    magmaInt_ptr d_ipiv;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_queue_t queue = 0;
    
    printf("            %8s sswap    sswap             sswapblk          slaswp   slaswp2  slaswpx           scopymatrix      CPU      (all in )\n", g_platform_str );
    printf("    N   nb  row-maj/col-maj   row-maj/col-maj   row-maj/col-maj   row-maj  row-maj  row-maj/col-maj   row-blk/col-blk  slaswp   (GByte/s)\n");
    printf("=========================================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // For an N x N matrix, swap nb rows or nb columns using various methods.
            // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure.
            // The variable 'shift' keeps track of which bit is for current test
            int shift = 1;
            int check = 0;
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N+31)/32)*32;
            nb     = (opts.nb > 0 ? opts.nb : magma_get_sgetrf_nb( N ));
            nb     = min( N, nb );
            // each swap does 2N loads and 2N stores, for nb swaps
            gbytes = sizeof(float) * 4.*N*nb / 1e9;
            
            TESTING_MALLOC_PIN( h_A1, float, lda*N );
            TESTING_MALLOC_PIN( h_A2, float, lda*N );
            TESTING_MALLOC_PIN( h_R1, float, lda*N );
            TESTING_MALLOC_PIN( h_R2, float, lda*N );
            
            TESTING_MALLOC_CPU( ipiv,  magma_int_t, nb );
            TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb );
            
            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb );
            TESTING_MALLOC_DEV( d_A1, float, ldda*N );
            TESTING_MALLOC_DEV( d_A2, float, ldda*N );
            
            // getrf always makes ipiv[j] >= j+1, where ipiv is one based and j is zero based
            // some implementations (e.g., MacOS dlaswp) assume this
            for( j=0; j < nb; j++ ) {
                ipiv[j] = (rand() % (N-j)) + j + 1;
                assert( ipiv[j] >= j+1 );
                assert( ipiv[j] <= N   );
            }
            
            /* =====================================================================
             * cublas / clBLAS / Xeon Phi sswap, row-by-row (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    #ifdef HAVE_CUBLAS
                        cublasSswap( opts.handle, N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1 );
                    #else
                        magma_sswap( N, d_A1, ldda*j, 1, d_A2, ldda*(ipiv[j]-1), 1, opts.queue );
                    #endif
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf0 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    #ifdef HAVE_CUBLAS
                        cublasSswap( opts.handle, N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                    #else
                        magma_sswap( N, d_A1, j, ldda, d_A2, ipiv[j]-1, ldda, opts.queue );
                    #endif
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf0 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * sswap, row-by-row (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_sswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_sswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * sswapblk, blocked version (2 matrices)
             */
            
            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_sswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            row_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_ssetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_sswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            col_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_sgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * LAPACK-style slaswp (1 matrix)
             */
            
            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_slaswp( N, d_A1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf4 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * LAPACK-style slaswp (1 matrix) - d_ipiv on GPU
             */
            
            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 );
            magmablas_slaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 );
            time = magma_sync_wtime( queue ) - time;
            row_perf7 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * LAPACK-style slaswpx (extended for row- and col-major) (1 matrix)
             */
            
            #ifdef HAVE_CUBLAS
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_slaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf5 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_sswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* Col Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_ssetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_slaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            col_perf5 = gbytes / time;
            #endif
            
            /* LAPACK swap on CPU for comparison */
            time = magma_wtime();
            lapackf77_slaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione);
            time = magma_wtime() - time;
            cpu_perf = gbytes / time;
            
            #ifdef HAVE_CUBLAS
            magma_sgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            #endif

            /* =====================================================================
             * Copy matrix.
             */
            
            time = magma_sync_wtime( queue );
            magma_scopymatrix( N, nb, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            col_perf6 = 0.5 * gbytes / time;
            
            time = magma_sync_wtime( queue );
            magma_scopymatrix( nb, N, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            row_perf6 = 0.5 * gbytes / time;

            printf("%5d  %3d  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f / %6.2f  %6.2f  %10s\n",
                   (int) N, (int) nb,
                   row_perf0, ((check & 0x001) != 0 ? '*' : ' '),
                   col_perf0, ((check & 0x002) != 0 ? '*' : ' '),
                   row_perf1, ((check & 0x004) != 0 ? '*' : ' '),
                   col_perf1, ((check & 0x008) != 0 ? '*' : ' '),
                   row_perf2, ((check & 0x010) != 0 ? '*' : ' '),
                   col_perf2, ((check & 0x020) != 0 ? '*' : ' '),
                   row_perf4, ((check & 0x040) != 0 ? '*' : ' '),
                   row_perf7, ((check & 0x080) != 0 ? '*' : ' '),
                   row_perf5, ((check & 0x100) != 0 ? '*' : ' '),
                   col_perf5, ((check & 0x200) != 0 ? '*' : ' '),
                   row_perf6,
                   col_perf6,
                   cpu_perf,
                   (check == 0 ? "ok" : "* failed") );
            status += ! (check == 0);
            
            TESTING_FREE_PIN( h_A1 );
            TESTING_FREE_PIN( h_A2 );
            TESTING_FREE_PIN( h_R1 );
            TESTING_FREE_PIN( h_R2 );
            
            TESTING_FREE_CPU( ipiv  );
            TESTING_FREE_CPU( ipiv2 );
            
            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_A1 );
            TESTING_FREE_DEV( d_A2 );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
예제 #5
0
파일: main.c 프로젝트: atommed/OP_repo
/*
 * Interpret one command line and run the matching matrix operation.
 *
 * The buffer is split in place at its first space: everything before the
 * space is the command name, everything after it is the argument string.
 * Commands that take arguments print a usage line to work_wnd when the
 * arguments are missing or cannot be parsed.  An empty line is ignored and
 * an unknown command name falls through to print_help().
 *
 * cmd: NUL-terminated, writable command line (modified by the split).
 */
void eval(char *cmd) {
  char *args = NULL; /* text following the command name, if any */
  char *space;

  if (cmd[0] == '\0')
    return;

  /* Cut the line into "name" and "arguments" at the first blank. */
  space = strchr(cmd, ' ');
  if (space != NULL) {
    *space = '\0';
    args = space + 1;
  }

  if (!strcmp(cmd, "init")) {
    int rows, cols;
    if (args != NULL && sscanf(args, "%d %d", &rows, &cols) == 2)
      init_matrix(rows, cols);
    else
      wprintw(work_wnd, "Usage: init nrows ncolumns\n");
  } else if (!strcmp(cmd, "randomize")) {
    int low, high;
    if (args != NULL && sscanf(args, "%d %d", &low, &high) == 2)
      randomize(low, high);
    else
      wprintw(work_wnd, "Usage: randomize min max\n");
  } else if (!strcmp(cmd, "mutate")) {
    int row, col, value;
    if (args != NULL && sscanf(args, "%d %d %d", &row, &col, &value) == 3)
      mutate(row, col, value);
    else
      wprintw(work_wnd, "Usage: mutate row column new_value\n");
  } else if (!strcmp(cmd, "sumC")) {
    size_t column;
    if (args != NULL && sscanf(args, "%zu", &column) == 1)
      sumColumn(column);
    else
      wprintw(work_wnd, "Usage: sumC column\n");
  } else if (!strcmp(cmd, "null")) {
    tonull();
  } else if (!strcmp(cmd, "sumDown")) {
    sumDown();
  } else if (!strcmp(cmd, "reflectSide")) {
    transposeSide();
  } else if (!strcmp(cmd, "rotate")) {
    rotate();
  } else if (!strcmp(cmd, "flipH")) {
    flipH();
  } else if (!strcmp(cmd, "avg")) {
    avg();
  } else if (!strcmp(cmd, "feswap")) {
    feswap();
  } else if (!strcmp(cmd, "leswap")) {
    leswap();
  } else if (!strcmp(cmd, "ecswap")) {
    ecswap();
  } else if (!strcmp(cmd, "q")) {
    /* Quit: run the program's shutdown hook, then leave the process. */
    finish();
    exit(EXIT_SUCCESS);
  } else {
    print_help();
  }
}
/*
 * Entry point of the threaded matrix-product demo.
 *
 * Initialises the matrices, starts COLS1 worker threads running multiplica
 * (each is handed its index as the thread argument) plus one thread running
 * soma, waits for all of them, and finally prints matrix1, matrix2 and
 * matrix_final.  The barrier `barr` is sized for COLS1+1 participants, i.e.
 * every worker plus the soma thread.
 *
 * Returns 0 on success and -1 if the barrier or any thread could not be
 * created or joined.
 */
int main(int argc, char **argv)
{
    pthread_t thr[COLS1];
    pthread_t sumThread;
    int i;
    int r, c;

    (void)argc;
    (void)argv;

    init_matrix();

    // Barrier initialization
    if(pthread_barrier_init(&barr, NULL, COLS1+1)){
        printf("Could not create a barrier\n");
        return -1;
    }

    for(i = 0; i < COLS1; ++i){
        /* Go through size_t so the int index is widened to pointer size
         * before being smuggled through the void* argument. */
        if(pthread_create(&thr[i], NULL, &multiplica, (void*)(size_t)i)){
            printf("Could not create thread %d\n", i);
            return -1;
        }
    }
    if(pthread_create(&sumThread, NULL, &soma, NULL)){
        printf("Could not create thread %d\n", i);
        return -1;
    }

    for(i = 0; i < COLS1; ++i){
        if(pthread_join(thr[i], NULL)){
            printf("Could not join thread %d\n", i);
            return -1;
        }
    }
    if(pthread_join(sumThread, NULL)){
        printf("Could not join thread %d\n", i);
        return -1;
    }

    /* Every participant has been joined, so nobody can still be waiting on
     * the barrier: release it. */
    pthread_barrier_destroy(&barr);

    printf("Matriz 1:\n");
    for(r = 0; r < ROWS1; r++){
        for(c = 0; c < COLS1; c++){
            printf("%d ", matrix1[r][c]);
        }
        printf("\n");
    }

    printf("Matriz 2:\n");
    for(r = 0; r < ROWS2; r++){
        for(c = 0; c < COLS2; c++){
            printf("%d ", matrix2[r][c]);
        }
        printf("\n");
    }

    printf("Matriz Final:\n");
    for(r = 0; r < ROWS1; r++){
        for(c = 0; c < COLS2; c++){
            printf("%d ", matrix_final[r][c]);
        }
        printf("\n");
    }
    printf("TERMINOU!\n");
    return 0;
}
예제 #7
0
파일: main.c 프로젝트: pxlpnk/ipc_2012
/*
 * Driver for the matrix-vector multiplication benchmark.
 *
 * Command line options:
 *   -n N      matrix size (required, must be positive)
 *   -a ALGO   "ref" or "reduce_scatter" (default: reduce_scatter)
 *   -f FILE   append the timing line to FILE
 *   -i procs  report the -p value instead of N in the timing line
 *   -p NT     value reported when "-i procs" is given (at most max_threads)
 *
 * The root rank builds the full matrix and vector; for the reduce_scatter
 * algorithm they are distributed, the computation is timed between two
 * barriers and the maximum time over all ranks is reduced onto root.
 * Rank 0 prints "<N or nt>,<seconds>" to stdout and, if requested, to the
 * log file.
 *
 * Returns 0 on success, EXIT_FAILURE on a bad option, 1 if N is missing.
 */
int main(int argc, char** argv) {
  int rank, size;
  int N = 0;    /* stays 0 when -n is not given, which triggers the usage text */
  int opt;      /* int, not char: getopt_long returns -1 at the end of the options */
  int nt = -1;
  int max_threads = 16; // on jupiter

  bool id = false;

  algo_t algo = reduce_scatter;
  FILE *f = NULL;

  static const char optstring[] = "n:a:f:i:p:";
  static const struct option long_options[] = {
    {"n",    1, NULL, 'n'},
    {"file", 1, NULL, 'f'},
    {"i",    1, NULL, 'i'},
    {NULL,   0, NULL, 0}
  };

  MPI_Init(&argc,&argv);

  // get rank and size from communicator
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  while ((opt = getopt_long(argc, argv, optstring, long_options, NULL)) != -1) {
    switch(opt) {
    case 'i':
      if (strcmp("procs", optarg) == 0) {
        id = true;
      }
      break;
    case 'p':
      nt = atoi(optarg);
      if (nt > max_threads) {
        printf("Using too much procs %d, use max %d", nt, max_threads);
        MPI_Finalize();
        return EXIT_FAILURE;
      } else {
        printf("Using %d procs.", nt);
      }
      /* Do not fall through into 'n': -p must not overwrite the matrix size. */
      break;
    case 'n':
      N = atoi(optarg);
      break;
    case 'f':
      f = fopen(optarg,"a");
      if (f == NULL) {
        mpi_printf(root, "Could not open log file '%s': %s\n", optarg, strerror(errno));
        MPI_Finalize();
        return  EXIT_FAILURE;
      }
      break;
    case 'a':
      if (strcmp("ref", optarg) == 0) {
        mpi_printf(root, "Using reference implementation \n");
        algo = ref;
      } else if ((strcmp("reduce_scatter", optarg) == 0)) {
        /* NOTE(review): the message mentions MPI_Allgather although the
         * selected algorithm is reduce_scatter - confirm the intended text. */
        mpi_printf(root, "Using MPI_Allgather implementation \n");
        algo = reduce_scatter;
      }
      break;
    default:
      MPI_Finalize();
      return  EXIT_FAILURE;
    }
  }

  if(N <= 0) {
    if ( rank == root ){
      printf("Usage: mpirun -nn nodecount p3-reduce_scatter.exe -n N\n");
      printf("N is the the matrix size. \n\n");
    }
    if (f != NULL) {
      fclose(f);
    }
    MPI_Finalize();
    return 1;
  }


  /* ======================================================== */
  /* Initialisation matrix & vector */

  ATYPE *matrix = NULL;
  ATYPE *vector = NULL;

  if (rank == root) {
    debug("Setting up root data structures");
    matrix = init_matrix(N,1);
    vector = init_vector(N,1);
  }

  /* colcnt is the size of the largest slice: the regular N/size share plus
   * whatever remains after giving size-1 ranks their share. */
  int colcnt =  N - (N/size ) * (size - 1 );
  int partition = N/size;

  ATYPE *local_matrix = NULL;
  local_matrix = (ATYPE*) malloc (sizeof(ATYPE) * N * colcnt);

  ATYPE *local_vector = NULL;
  local_vector = (ATYPE*) malloc (sizeof(ATYPE) * partition) ;

  ATYPE *reference = NULL;
  reference = init_vector(N,1);

  ATYPE *result = NULL;
  result = init_vector(N,1);

  double inittime;
  double totaltime = 0.0;  /* defined even on paths that never take a timing */

  if( algo == ref) {
    if (rank == root) {
      inittime = MPI_Wtime();
      matrix_vector_mult_ref(matrix, vector, N, reference);
      totaltime = MPI_Wtime() - inittime;
    }
  } else if (algo == reduce_scatter) {

    if(rank == root){
      debug("Comptuting reference");
      matrix_vector_mult_ref(matrix, vector, N, reference);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* ======================================================== */
    /* distributing matrix and vector */

    distribute_vector(vector, local_vector, rank, size, partition, N);
    distribute_matrix(matrix, local_matrix, rank, size, partition, N);

    debug("begin MPI_Reduce_scatter");
    MPI_Barrier(MPI_COMM_WORLD);
    inittime = MPI_Wtime();
    compute_reduce_scatter(local_matrix, local_vector, result, rank, size, N, partition);

    MPI_Barrier(MPI_COMM_WORLD);

    totaltime = MPI_Wtime() - inittime;
    double localtime = totaltime;

    /* Report the slowest rank's time on root. */
    MPI_Reduce(&localtime, &totaltime, 1, MPI_DOUBLE, MPI_MAX, root,  MPI_COMM_WORLD);

    debug("after MPI_Reduce_scatter");
    /* TODO: fix test so it uses vector idea  */
    /* debug("Testing result"); */
    /* if (test_vector_part(result, local_vector, (rank * partition) , partition)) { */
    /*   debug("testresult: OK"); */
    /* } else { */
    /*   debug("testresult: FAILURE"); */
    /*   debug("Result:"); */
    /*   printArray(recvbuff, N); */
    /*   debug("Reference:"); */
    /*   printArray(reference,N); */
    /* } */

    MPI_Barrier(MPI_COMM_WORLD);
  }

  if (rank == 0) {
    if (f != NULL) {
      if (id) {
        fprintf(f,"%d,%lf\n",nt, totaltime);
      } else {
        fprintf(f,"%d,%lf\n",N, totaltime);
      }
    }
    if (id) {
      printf("%d,%lf\n",nt , totaltime);
    } else {
      printf("%d,%lf\n",N , totaltime);
    }
  }

  debug("cleaning up");

  /* Release every buffer obtained above; free(NULL) is a no-op, so the
   * root-only pointers are safe to pass on the other ranks too. */
  free(result);
  free(reference);
  free(local_vector);
  free(local_matrix);
  free(vector);
  free(matrix);

  MPI_Finalize();

  if ( f != NULL) {
    fclose(f);
  }
  return 0;
}
예제 #8
0
/*
 * Run one point-transform routine and compare it with the reference.
 *
 * A 4x4 matrix is allocated with 16-byte alignment, filled by init_matrix()
 * and then constrained according to templates[mtype]: NIL/ONE/NEG entries
 * are forced to 0/1/-1, VAR entries keep whatever init_matrix() wrote.  The
 * template is indexed [i*4+j] while the matrix is written at [j*4+i].
 * TEST_COUNT source points get their first psize components from rnd().
 * Both ref_transform() and func are applied, and every component of every
 * result is compared with significand_match().
 *
 * func:   routine under test, called as func(dest, mat->m, source)
 * psize:  number of randomised components per point (at most 4)
 * mtype:  index into mtypes[] / templates[]
 * cycles: benchmark counter, handed to BEGIN_RACE/END_RACE when
 *         mesa_profile is set
 *
 * Returns 1 when all components reach REQUIRED_PRECISION, 0 otherwise
 * (the first mismatching point is printed).  The matrix storage is released
 * on every exit path taken after it has been allocated.
 */
static int test_transform_function( transform_func func, int psize,
                                    int mtype, unsigned long *cycles )
{
    GLvector4f source[1], dest[1], ref[1];
    GLmatrix mat[1];
    GLfloat *m;
    int i, j;
#ifdef  RUN_DEBUG_BENCHMARK
    int cycle_i;                /* the counter for the benchmarks we run */
#endif

    (void) cycles;

    if ( psize > 4 ) {
        _mesa_problem( NULL, "test_transform_function called with psize > 4\n" );
        return 0;
    }

    mat->m = (GLfloat *) _mesa_align_malloc( 16 * sizeof(GLfloat), 16 );
    mat->type = mtypes[mtype];

    m = mat->m;
    ASSERT( ((long)m & 15) == 0 );

    init_matrix( m );

    for ( i = 0 ; i < 4 ; i++ ) {
        for ( j = 0 ; j < 4 ; j++ ) {
            switch ( templates[mtype][i * 4 + j] ) {
            case NIL:
                m[j * 4 + i] = 0.0;
                break;
            case ONE:
                m[j * 4 + i] = 1.0;
                break;
            case NEG:
                m[j * 4 + i] = -1.0;
                break;
            case VAR:
                /* keep the value produced by init_matrix() */
                break;
            default:
                ASSERT(0);
                /* Do not leak the matrix when the template is malformed. */
                _mesa_align_free( mat->m );
                return 0;
            }
        }
    }

    for ( i = 0 ; i < TEST_COUNT ; i++) {
        ASSIGN_4V( d[i], 0.0, 0.0, 0.0, 1.0 );
        ASSIGN_4V( s[i], 0.0, 0.0, 0.0, 1.0 );
        for ( j = 0 ; j < psize ; j++ )
            s[i][j] = rnd();
    }

    source->data = (GLfloat(*)[4])s;
    source->start = (GLfloat *)s;
    source->count = TEST_COUNT;
    source->stride = sizeof(s[0]);
    source->size = 4;
    source->flags = 0;

    dest->data = (GLfloat(*)[4])d;
    dest->start = (GLfloat *)d;
    dest->count = TEST_COUNT;
    dest->stride = sizeof(float[4]);
    dest->size = 0;
    dest->flags = 0;

    ref->data = (GLfloat(*)[4])r;
    ref->start = (GLfloat *)r;
    ref->count = TEST_COUNT;
    ref->stride = sizeof(float[4]);
    ref->size = 0;
    ref->flags = 0;

    ref_transform( ref, mat, source );

    if ( mesa_profile ) {
        BEGIN_RACE( *cycles );
        func( dest, mat->m, source );
        END_RACE( *cycles );
    }
    else {
        func( dest, mat->m, source );
    }

    for ( i = 0 ; i < TEST_COUNT ; i++ ) {
        for ( j = 0 ; j < 4 ; j++ ) {
            if ( significand_match( d[i][j], r[i][j] ) < REQUIRED_PRECISION ) {
                /* Dump all four components of the offending point. */
                printf("-----------------------------\n" );
                printf("(i = %i, j = %i)\n", i, j );
                printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                       d[i][0], r[i][0], r[i][0]-d[i][0],
                       MAX_PRECISION - significand_match( d[i][0], r[i][0] ) );
                printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                       d[i][1], r[i][1], r[i][1]-d[i][1],
                       MAX_PRECISION - significand_match( d[i][1], r[i][1] ) );
                printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                       d[i][2], r[i][2], r[i][2]-d[i][2],
                       MAX_PRECISION - significand_match( d[i][2], r[i][2] ) );
                printf("%f \t %f \t [diff = %e - %i bit missed]\n",
                       d[i][3], r[i][3], r[i][3]-d[i][3],
                       MAX_PRECISION - significand_match( d[i][3], r[i][3] ) );
                /* Release the matrix before reporting the failure. */
                _mesa_align_free( mat->m );
                return 0;
            }
        }
    }

    _mesa_align_free( mat->m );
    return 1;
}
예제 #9
0
/*
 * Run one normal-transform routine and compare it with the reference.
 *
 * A 4x4 matrix is allocated with 16-byte alignment (used both as mat->m and
 * mat->inv), filled by init_matrix() and constrained by
 * norm_templates[mtype]: NIL/ONE/NEG entries are forced to 0/1/-1, VAR
 * entries keep whatever init_matrix() wrote.  TEST_COUNT source normals are
 * randomised and their inverse lengths precomputed.  The routine is run
 * twice, without and with the precomputed lengths, and the first three
 * components of each result are compared with significand_match().  The
 * precomputed-length results are only checked when
 * norm_normalize_types[mtype] is non-zero.
 *
 * func:   routine under test, called as func(mat, scale, source, lengths, dest)
 * mtype:  index into norm_templates[] / norm_scale_types[] /
 *         norm_normalize_types[]
 * cycles: benchmark counter, handed to BEGIN_RACE/END_RACE when
 *         mesa_profile is set
 *
 * Returns 1 when all checked components reach REQUIRED_PRECISION, 0
 * otherwise (the first mismatching normal is printed).  The matrix storage
 * is released on both the success and the mismatch paths.
 */
static int test_norm_function( normal_func func, int mtype, long *cycles )
{
   GLvector4f source[1], dest[1], dest2[1], ref[1], ref2[1];
   GLmatrix mat[1];
   /* The source rows are 5 floats wide although only 3 are used, so the
    * source stride (sizeof(s[0])) differs from the destination stride. */
   GLfloat s[TEST_COUNT][5], d[TEST_COUNT][4], r[TEST_COUNT][4];
   GLfloat d2[TEST_COUNT][4], r2[TEST_COUNT][4], length[TEST_COUNT];
   GLfloat scale;
   GLfloat *m;
   int i, j;
#ifdef  RUN_DEBUG_BENCHMARK
   int cycle_i;		/* the counter for the benchmarks we run */
#endif

   (void) cycles;

   mat->m = (GLfloat *) ALIGN_MALLOC( 16 * sizeof(GLfloat), 16 );
   mat->inv = m = mat->m;

   init_matrix( m );

   scale = 1.0F + rnd () * norm_scale_types[mtype];

   for ( i = 0 ; i < 4 ; i++ ) {
      for ( j = 0 ; j < 4 ; j++ ) {
         switch ( norm_templates[mtype][i * 4 + j] ) {
         case NIL:
            m[j * 4 + i] = 0.0;
            break;
         case ONE:
            m[j * 4 + i] = 1.0;
            break;
         case NEG:
            m[j * 4 + i] = -1.0;
            break;
         case VAR:
            /* keep the value produced by init_matrix() */
            break;
         default:
            _mesa_exit(1);
         }
      }
   }

   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
      ASSIGN_3V( d[i],  0.0, 0.0, 0.0 );
      ASSIGN_3V( s[i],  0.0, 0.0, 0.0 );
      ASSIGN_3V( d2[i], 0.0, 0.0, 0.0 );
      for ( j = 0 ; j < 3 ; j++ )
         s[i][j] = rnd();
      length[i] = 1 / SQRTF( LEN_SQUARED_3FV( s[i] ) );
   }

   source->data = (GLfloat(*)[4]) s;
   source->start = (GLfloat *) s;
   source->count = TEST_COUNT;
   source->stride = sizeof(s[0]);
   source->flags = 0;

   dest->data = d;
   dest->start = (GLfloat *) d;
   dest->count = TEST_COUNT;
   dest->stride = sizeof(float[4]);
   dest->flags = 0;

   dest2->data = d2;
   dest2->start = (GLfloat *) d2;
   dest2->count = TEST_COUNT;
   dest2->stride = sizeof(float[4]);
   dest2->flags = 0;

   ref->data = r;
   ref->start = (GLfloat *) r;
   ref->count = TEST_COUNT;
   ref->stride = sizeof(float[4]);
   ref->flags = 0;

   ref2->data = r2;
   ref2->start = (GLfloat *) r2;
   ref2->count = TEST_COUNT;
   ref2->stride = sizeof(float[4]);
   ref2->flags = 0;

   if ( norm_normalize_types[mtype] == 0 ) {
      ref_norm_transform_rescale( mat, scale, source, NULL, ref );
   } else {
      ref_norm_transform_normalize( mat, scale, source, NULL, ref );
      ref_norm_transform_normalize( mat, scale, source, length, ref2 );
   }

   if ( mesa_profile ) {
      BEGIN_RACE( *cycles );
      func( mat, scale, source, NULL, dest );
      END_RACE( *cycles );
      func( mat, scale, source, length, dest2 );
   } else {
      func( mat, scale, source, NULL, dest );
      func( mat, scale, source, length, dest2 );
   }

   for ( i = 0 ; i < TEST_COUNT ; i++ ) {
      for ( j = 0 ; j < 3 ; j++ ) {
         if ( significand_match( d[i][j], r[i][j] ) < REQUIRED_PRECISION ) {
            _mesa_printf( "-----------------------------\n" );
            _mesa_printf( "(i = %i, j = %i)\n", i, j );
            _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                          d[i][0], r[i][0], r[i][0]/d[i][0],
                          MAX_PRECISION - significand_match( d[i][0], r[i][0] ) );
            _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                          d[i][1], r[i][1], r[i][1]/d[i][1],
                          MAX_PRECISION - significand_match( d[i][1], r[i][1] ) );
            _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                          d[i][2], r[i][2], r[i][2]/d[i][2],
                          MAX_PRECISION - significand_match( d[i][2], r[i][2] ) );
            /* Release the matrix before reporting the failure. */
            ALIGN_FREE( mat->m );
            return 0;
         }

         if ( norm_normalize_types[mtype] != 0 ) {
            if ( significand_match( d2[i][j], r2[i][j] ) < REQUIRED_PRECISION ) {
               _mesa_printf( "------------------- precalculated length case ------\n" );
               _mesa_printf( "(i = %i, j = %i)\n", i, j );
               _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                             d2[i][0], r2[i][0], r2[i][0]/d2[i][0],
                             MAX_PRECISION - significand_match( d2[i][0], r2[i][0] ) );
               _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                             d2[i][1], r2[i][1], r2[i][1]/d2[i][1],
                             MAX_PRECISION - significand_match( d2[i][1], r2[i][1] ) );
               _mesa_printf( "%f \t %f \t [ratio = %e - %i bit missed]\n",
                             d2[i][2], r2[i][2], r2[i][2]/d2[i][2],
                             MAX_PRECISION - significand_match( d2[i][2], r2[i][2] ) );
               /* Release the matrix before reporting the failure. */
               ALIGN_FREE( mat->m );
               return 0;
            }
         }
      }
   }

   ALIGN_FREE( mat->m );
   return 1;
}
예제 #10
0
/**
 * C-API entry point for a general matrix-matrix product.
 *
 * Wraps the three matrix descriptors in viennacl::matrix_base objects of the
 * precision named by A->precision and forwards them, together with the
 * matching members of alpha and beta, to viennacl::linalg::prod_impl.
 * A and B are passed through viennacl::trans() when their `trans` field is
 * ViennaCLTrans.
 *
 * Returns ViennaCLSuccess after the product has been issued, and
 * ViennaCLGenericFailure when a handle cannot be initialised, when the
 * precision is neither float nor double, or when a `trans` field of A or B
 * is neither ViennaCLTrans nor ViennaCLNoTrans.
 */
VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLgemm(ViennaCLHostScalar alpha, ViennaCLMatrix A, ViennaCLMatrix B, ViennaCLHostScalar beta, ViennaCLMatrix C)
{
  viennacl::backend::mem_handle A_handle;
  viennacl::backend::mem_handle B_handle;
  viennacl::backend::mem_handle C_handle;

  // Handles are set up in A, B, C order; the first failure aborts the call.
  if (   init_matrix(A_handle, A) != ViennaCLSuccess
      || init_matrix(B_handle, B) != ViennaCLSuccess
      || init_matrix(C_handle, C) != ViennaCLSuccess)
    return ViennaCLGenericFailure;

  if (A->precision == ViennaCLFloat)
  {
    typedef viennacl::matrix_base<float>   matrix_type;
    typedef matrix_type::size_type         size_type;
    typedef matrix_type::difference_type   difference_type;

    matrix_type mat_A(A_handle,
                      static_cast<size_type>(A->size1), static_cast<size_type>(A->start1), static_cast<difference_type>(A->stride1), static_cast<size_type>(A->internal_size1),
                      static_cast<size_type>(A->size2), static_cast<size_type>(A->start2), static_cast<difference_type>(A->stride2), static_cast<size_type>(A->internal_size2),
                      A->order == ViennaCLRowMajor);
    matrix_type mat_B(B_handle,
                      static_cast<size_type>(B->size1), static_cast<size_type>(B->start1), static_cast<difference_type>(B->stride1), static_cast<size_type>(B->internal_size1),
                      static_cast<size_type>(B->size2), static_cast<size_type>(B->start2), static_cast<difference_type>(B->stride2), static_cast<size_type>(B->internal_size2),
                      B->order == ViennaCLRowMajor);
    matrix_type mat_C(C_handle,
                      static_cast<size_type>(C->size1), static_cast<size_type>(C->start1), static_cast<difference_type>(C->stride1), static_cast<size_type>(C->internal_size1),
                      static_cast<size_type>(C->size2), static_cast<size_type>(C->start2), static_cast<difference_type>(C->stride2), static_cast<size_type>(C->internal_size2),
                      C->order == ViennaCLRowMajor);

    bool const A_transposed = (A->trans == ViennaCLTrans);
    bool const B_transposed = (B->trans == ViennaCLTrans);

    // Anything other than Trans / NoTrans on either operand is rejected.
    if (   (!A_transposed && A->trans != ViennaCLNoTrans)
        || (!B_transposed && B->trans != ViennaCLNoTrans))
      return ViennaCLGenericFailure;

    if (A_transposed)
    {
      if (B_transposed)
        viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
      else
        viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_float, beta->value_float);
    }
    else
    {
      if (B_transposed)
        viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_float, beta->value_float);
      else
        viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_float, beta->value_float);
    }

    return ViennaCLSuccess;
  }

  if (A->precision == ViennaCLDouble)
  {
    typedef viennacl::matrix_base<double>  matrix_type;
    typedef matrix_type::size_type         size_type;
    typedef matrix_type::difference_type   difference_type;

    matrix_type mat_A(A_handle,
                      static_cast<size_type>(A->size1), static_cast<size_type>(A->start1), static_cast<difference_type>(A->stride1), static_cast<size_type>(A->internal_size1),
                      static_cast<size_type>(A->size2), static_cast<size_type>(A->start2), static_cast<difference_type>(A->stride2), static_cast<size_type>(A->internal_size2),
                      A->order == ViennaCLRowMajor);
    matrix_type mat_B(B_handle,
                      static_cast<size_type>(B->size1), static_cast<size_type>(B->start1), static_cast<difference_type>(B->stride1), static_cast<size_type>(B->internal_size1),
                      static_cast<size_type>(B->size2), static_cast<size_type>(B->start2), static_cast<difference_type>(B->stride2), static_cast<size_type>(B->internal_size2),
                      B->order == ViennaCLRowMajor);
    matrix_type mat_C(C_handle,
                      static_cast<size_type>(C->size1), static_cast<size_type>(C->start1), static_cast<difference_type>(C->stride1), static_cast<size_type>(C->internal_size1),
                      static_cast<size_type>(C->size2), static_cast<size_type>(C->start2), static_cast<difference_type>(C->stride2), static_cast<size_type>(C->internal_size2),
                      C->order == ViennaCLRowMajor);

    bool const A_transposed = (A->trans == ViennaCLTrans);
    bool const B_transposed = (B->trans == ViennaCLTrans);

    // Anything other than Trans / NoTrans on either operand is rejected.
    if (   (!A_transposed && A->trans != ViennaCLNoTrans)
        || (!B_transposed && B->trans != ViennaCLNoTrans))
      return ViennaCLGenericFailure;

    if (A_transposed)
    {
      if (B_transposed)
        viennacl::linalg::prod_impl(viennacl::trans(mat_A), viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
      else
        viennacl::linalg::prod_impl(viennacl::trans(mat_A), mat_B, mat_C, alpha->value_double, beta->value_double);
    }
    else
    {
      if (B_transposed)
        viennacl::linalg::prod_impl(mat_A, viennacl::trans(mat_B), mat_C, alpha->value_double, beta->value_double);
      else
        viennacl::linalg::prod_impl(mat_A, mat_B, mat_C, alpha->value_double, beta->value_double);
    }

    return ViennaCLSuccess;
  }

  // Unsupported precision.
  return ViennaCLGenericFailure;
}
예제 #11
0
int main(int argc, char **argv)
{
	int rank;
	int world_size;

	/*
	 *	Initialization
	 */
	int thread_support;
	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
		fprintf(stderr,"MPI_Init_thread failed\n");
		exit(1);
	}
	if (thread_support == MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
	if (thread_support < MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI does not have thread support!\n");

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &world_size);

	starpu_srand48((long int)time(NULL));

	parse_args(rank, argc, argv);

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* We disable sequential consistency in this example */
	starpu_data_set_default_sequential_consistency_flag(0);

	starpu_mpi_init(NULL, NULL, 0);

	STARPU_ASSERT(p*q == world_size);

	starpu_cublas_init();

	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/*
	 * 	Problem Init
	 */

	init_matrix(rank);

	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
                        (int)(allocated_memory/(1024*1024)),
			(int)(allocated_memory_extra/(1024*1024)),
                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));

	display_grid(rank, nblocks);

	TYPE *a_r = NULL;
//	STARPU_PLU(display_data_content)(a_r, size);

	TYPE *x, *y;

	if (check)
	{
		x = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(x);

		y = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			unsigned ind;
			for (ind = 0; ind < size; ind++)
				x[ind] = (TYPE)starpu_drand48();
		}

		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);

		if (rank == 0)
			STARPU_PLU(display_data_content)(a_r, size);

//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
	}

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

	/*
	 * 	Report performance
	 */

	int reduce_ret;
	double min_timing = timing;
	double max_timing = timing;
	double sum_timing = timing;

	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	if (rank == 0)
	{
		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));

		unsigned n = size;
		double flop = (2.0f*n*n*n)/3.0f;
		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
	}

	/*
	 *	Test Result Correctness
	 */

	if (check)
	{
		/*
		 *	Compute || A - LU ||
		 */

		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);

#if 0
		/*
		 *	Compute || Ax - LUx ||
		 */

		unsigned ind;

		y2 = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			for (ind = 0; ind < size; ind++)
			{
				y2[ind] = (TYPE)0.0;
			}
		}

		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);

		/* Compute y2 = y2 - y */
		CPU_AXPY(size, -1.0, y, 1, y2, 1);

		TYPE err = CPU_ASUM(size, y2, 1);
		int max = CPU_IAMAX(size, y2, 1);

		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
#endif
	}

	/*
	 * 	Termination
	 */

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	starpu_cublas_shutdown();
	starpu_mpi_shutdown();
	starpu_shutdown();

#if 0
	MPI_Finalize();
#endif

	return 0;
}
예제 #12
0
/**
 * Triangular solve with a matrix right-hand side, dispatched to
 * viennacl::linalg::inplace_solve().
 *
 * Both matrix descriptors are wrapped in viennacl::matrix_base views (no copy
 * is made here) and the solver overload is selected from:
 *   - A->precision : float or double,
 *   - A->trans / B->trans : whether each operand is passed as viennacl::trans(),
 *   - uplo / diag  : upper/lower and unit/non-unit triangular tag.
 *
 * @param A     Descriptor of the triangular system matrix.
 * @param uplo  ViennaCLUpper or ViennaCLLower.
 * @param diag  ViennaCLUnit or ViennaCLNonUnit.
 * @param B     Descriptor of the right-hand side matrix (handed to the in-place solver).
 *
 * @return ViennaCLSuccess after the dispatch; ViennaCLGenericFailure if either
 *         handle cannot be initialised, if A->precision is neither float nor
 *         double, or if the uplo/diag pair is not one of the four known
 *         combinations.
 *
 * NOTE: if A->trans / B->trans match none of the four Trans/NoTrans
 * combinations, no solver is called and ViennaCLSuccess is still returned.
 */
VIENNACL_EXPORTED_FUNCTION ViennaCLStatus ViennaCLtrsm(ViennaCLMatrix A, ViennaCLUplo uplo, ViennaCLDiag diag, ViennaCLMatrix B)
{
  viennacl::backend::mem_handle A_handle;
  viennacl::backend::mem_handle B_handle;

  if (init_matrix(A_handle, A) != ViennaCLSuccess)
    return ViennaCLGenericFailure;

  if (init_matrix(B_handle, B) != ViennaCLSuccess)
    return ViennaCLGenericFailure;

  switch (A->precision)
  {
    case ViennaCLFloat:
    {
      typedef viennacl::matrix_base<float>::size_type           size_type;
      typedef viennacl::matrix_base<float>::difference_type     difference_type;

      // Non-owning views onto the caller's buffers.
      viennacl::matrix_base<float> mat_A(A_handle,
                                         size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
                                         size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
      viennacl::matrix_base<float> mat_B(B_handle,
                                         size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
                                         size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);

      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
      {
        // A is NOT transposed in this branch; only B is passed as trans().
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }

      return ViennaCLSuccess;
    }
    case ViennaCLDouble:
    {
      typedef viennacl::matrix_base<double>::size_type           size_type;
      typedef viennacl::matrix_base<double>::difference_type     difference_type;

      // Non-owning views onto the caller's buffers.
      viennacl::matrix_base<double> mat_A(A_handle,
                                          size_type(A->size1), size_type(A->start1), difference_type(A->stride1), size_type(A->internal_size1),
                                          size_type(A->size2), size_type(A->start2), difference_type(A->stride2), size_type(A->internal_size2), A->order == ViennaCLRowMajor);
      viennacl::matrix_base<double> mat_B(B_handle,
                                          size_type(B->size1), size_type(B->start1), difference_type(B->stride1), size_type(B->internal_size1),
                                          size_type(B->size2), size_type(B->start2), difference_type(B->stride2), size_type(B->internal_size2), B->order == ViennaCLRowMajor);

      if (A->trans == ViennaCLTrans && B->trans == ViennaCLTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(viennacl::trans(mat_A), mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLTrans)
      {
        // A is NOT transposed in this branch; only B is passed as trans().
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, viennacl::trans(mat_B), viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }
      else if (A->trans == ViennaCLNoTrans && B->trans == ViennaCLNoTrans)
      {
        if (uplo == ViennaCLUpper && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::upper_tag());
        else if (uplo == ViennaCLUpper && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_upper_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLNonUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::lower_tag());
        else if (uplo == ViennaCLLower && diag == ViennaCLUnit)
          viennacl::linalg::inplace_solve(mat_A, mat_B, viennacl::linalg::unit_lower_tag());
        else
          return ViennaCLGenericFailure;
      }

      return ViennaCLSuccess;
    }

    default:
      return  ViennaCLGenericFailure;
  }
}
예제 #13
0
파일: test.cpp 프로젝트: chilumanxi/Study
/* Consumes the rest of the current stdin line, up to and including '\n'.
 * Stops early if end-of-file is reached. Replaces fflush(stdin), whose
 * behaviour on input streams is not defined by the C standard. */
static void discard_input_line(void)
{
    int ch;
    do
    {
        ch = getchar();
    } while (ch != '\n' && ch != EOF);
}

/*
 * Interactive console menu for sparse-matrix arithmetic on cross lists.
 *
 * Repeatedly shows the menu, reads a choice (1-4) and then:
 *   1 - reads two matrices and prints their sum,
 *   2 - reads two matrices and prints their difference
 *       (negates the second, then adds),
 *   3 - reads two matrices and prints their product,
 *   4 - asks for confirmation and exits.
 *
 * Anything that is not a number in 1..4 falls into the "invalid choice"
 * branch. End-of-file on stdin ends the program. Returns/exits with
 * status 0 on a normal quit.
 */
int main(void)
{
	crosslist one,two,three;
    int choice;  /* menu selection */
    int fields;  /* number of items scanf converted */
    int answer;  /* first character of the quit confirmation line */
    while(1)
	{
		system("cls");
        system("color 81");
        system("mode con cols=80 lines=400");
        system("title #Crosslist To Deal With Sparse Matrix#");
        printf("\t@*************************************************************@\n");
        putchar('\n');
        printf("\t\t %c----------稀疏矩阵-应用程序系统----------%c\n",2,2);
        putchar('\n');
        printf("\t@*************************************************************@\n");
        printf("\t$*************************************************************$\n");
        printf("\t\t %c----------------功能选择-----------------%c\n",2,2);
        putchar('\n');
        printf("\t\t %c-----------------------------------------%c\n",2,2);
        printf("\t\t %c <1> 稀疏矩阵的加法运算 %c\n",2,2);
        printf("\t\t %c-----------------------------------------%c\n",2,2);
        printf("\t\t %c <2> 稀疏矩阵的减法运算 %c\n",2,2);
        printf("\t\t %c-----------------------------------------%c\n",2,2);
        printf("\t\t %c <3> 稀疏矩阵的乘法运算 %c\n",2,2);
        printf("\t\t %c-----------------------------------------%c\n",2,2);
        printf("\t\t %c <4> 退出应用程序 %c\n",2,2);
        printf("\t\t %c-----------------------------------------%c\n",2,2);
        putchar('\n');
        printf("\t\t %c-----------矩阵以行序为主序-----------%c\n",2,2);
        printf("\t$*************************************************************$\n");
        printf("\t\t!!注意:如果想终止程序,请按 Ctrl +C\n");
        printf("\t$*************************************************************$\n\n");
        printf("请输入你的选择:(1--4)\n");
        printf("你的选择是:");

        fields = scanf("%d",&choice);
        if (fields == EOF)
            break;          /* stdin closed: nothing more can be read */
        if (fields != 1)
            choice = 0;     /* not a number: route to the invalid-choice branch */
        /* Drop whatever else was typed on the line (including the '\n'),
           so that bad input cannot be re-read forever and the quit
           confirmation below starts on a fresh line. */
        discard_input_line();

	    putchar('\n');
        switch(choice)
        {
		case 1: printf("\t<加法运算>\n");
			putchar('\n');

			init_matrix(one);/* initialise matrix one */

			printf("\t<建立第一个矩阵>\n");
			creat_matrix(one);
			putchar('\n');
			printf("第一个矩阵如下:\n");

			printf("-------------------------------------------------\n");
			print_matrix(one);
			printf("-------------------------------------------------\n");

			init_matrix(two);/* initialise matrix two */
			putchar('\n');
			printf("\t<建立第二个矩阵>\n");
			creat_matrix(two);
			putchar('\n');
			printf("第二个矩阵如下:\n");

			printf("-------------------------------------------------\n");
			print_matrix(two);
			printf("-------------------------------------------------\n");

			/* add the two matrices */
	        putchar('\n');
			printf("两个矩阵相加\n");

			init_matrix(three);/* initialise the result matrix */
			putchar('\n');

			add_matrix(one,two,three);
			printf("结果如下:\n");
			printf("-------------------------------------------------\n");
			Sleep(1000);
			print_matrix(three);
			printf("-------------------------------------------------\n");
			system("pause");
			break;

		case 2: printf("\t<减法运算>\n");
			putchar('\n');

			init_matrix(one);/* initialise matrix one */

			printf("\t<建立第一个矩阵>\n");
			creat_matrix(one);
			putchar('\n');
			printf("第一个矩阵如下:\n");

			printf("-------------------------------------------------\n");
			print_matrix(one);
			printf("-------------------------------------------------\n");

			init_matrix(two);/* initialise matrix two */
			putchar('\n');
			printf("\t<建立第二个矩阵>\n");
			creat_matrix(two);
			putchar('\n');
			printf("第二个矩阵如下:\n");

			printf("-------------------------------------------------\n");
			print_matrix(two);
			printf("-------------------------------------------------\n");

			/* subtract: negate the second matrix, then add */
         	putchar('\n');
			printf("两个矩阵相减\n");

			init_matrix(three);/* initialise the result matrix */
			putchar('\n');
            opposite_matrix(two);
			add_matrix(one,two,three);
			printf("结果如下:\n");
			printf("-------------------------------------------------\n");
			Sleep(1000);
			print_matrix(three);
			printf("-------------------------------------------------\n");
			system("pause");
			break;

		case 3: printf("\t<乘法运算>\n");
            putchar('\n');

			init_matrix(one);/* initialise matrix one */
			putchar('\n');

			printf("\t<建立第一个矩阵>\n");
			creat_matrix(one);
			putchar('\n');
			printf("第一个矩阵如下:\n");

			printf("-------------------------------------------------\n");
			print_matrix(one);
			printf("-------------------------------------------------\n");

			init_matrix(two);/* initialise matrix two */
			putchar('\n');
			printf("\t<建立第二个矩阵>\n");
			creat_matrix(two);
			putchar('\n');
			printf("第二个矩阵如下:\n");

			printf("-------------------------------------------------\n");
			print_matrix(two);
			printf("-------------------------------------------------\n");

            /* multiply the two matrices */
         	putchar('\n');
            printf("两个矩阵相乘\n");
            init_matrix(three);
            multi_matrix(one,two,three);
            printf("结果如下:\n");
			printf("-------------------------------------------------\n");
            Sleep(1000);
            print_matrix(three);
			printf("-------------------------------------------------\n");
            system("pause");
            break;

		case 4:	printf("你确定退出程序吗<Y/N>?\n");
            /* Read the first character of a fresh line. A bare Enter
               ('\n') counts as "yes", as does end-of-file. */
            answer = getchar();
            if (answer != '\n' && answer != EOF)
                discard_input_line();
            if(answer=='y'||answer=='Y'||answer=='\n'||answer==EOF)
			{
                printf("\t\t%c-------%c-------%c-------%c-------%c\n",2,2,2,2,2);
                putchar('\n');
                printf("\t\t(^_^)谢谢使用!(^_^)\n");
                putchar('\n');
                printf("\t\t%c-------%c-------%c-------%c-------%c\n",2,2,2,2,2);
                putchar('\n');
                Sleep(2000);
                exit(0);    /* user-requested quit is a successful termination */
			}
            else
			{
                printf("………欢迎继续使用………\n");
                Sleep(2000);
			}
            break;

		default:printf("请输入有效的选择 1 ~ 4!\n");
            Sleep(2000);
            break;
       }/* switch */
    }/* while */
    return 0;
}
예제 #14
0
/*
 * Solves the n x n system A x = b with total_processes worker threads
 * (thread_method_of_reflections) followed by back_hod(), then prints the
 * solution, the residual and the elapsed time, and writes x to "c.txt".
 *
 * Usage: prog <n> <total_processes> [<filename>]
 *
 * NOTE: the optional <filename> argument only switches the program into
 * "read from file" mode; the data itself is always read from "a.txt" and
 * "b.txt". Without it the matrix and vector are generated by
 * init_matrix()/init_vector().
 *
 * Returns 0 on success or on a usage error, 1 if the input files cannot be
 * read, 2 if a worker thread cannot be created.
 */
int main(int argc, char * argv[])
{
    pthread_t *tid;   // thread handles, one per worker
    args *arg;        // per-thread argument records
    int total_processes;
    double *a, *b, *x;
    int res1, res2;
    long int t;
    int n;
    int N;
    int i;
    char * filename = 0;
    const char * name = "c.txt";
    if( argc != 3 && argc != 4 )
    {
        printf("Usage : %s <n> <total_processes> <filename>\n", argv[0]);
        return 0;
    }
    n = atoi(argv[1]);  // parse the matrix dimension
    total_processes = atoi (argv[2]);
    // Reject zero, non-numeric (atoi yields 0) and negative values: a
    // negative count would otherwise reach the new[] expressions below.
    if(n <= 0 || total_processes <= 0)
    {
        printf("Usage : %s <n> <total_processes> <filename>\n", argv[0]);
        return 0;
    }
    // Compute n*n in size_t so that a large n cannot overflow int.
    a = new double[static_cast<size_t>(n) * n];
    b = new double[n];
    x = new double[n];
    tid = new pthread_t[total_processes];
    arg = new args[total_processes];
    if(argc > 3)
        filename = argv[3];

    if(filename)
    {
        res1 = read_matrix(a, n, "a.txt");
        res2 = read_vector(b, n, "b.txt");
        if(res1 || res2)
        {
            printf("cannot read from file\n");
            delete [] tid;
            delete [] arg;
            delete [] a;
            delete [] b;
            delete [] x;
            return 1;
        }
    }
    else
    {
        init_matrix(a, n);
        init_vector(b, a, n);
    }
    printf("matrix A:\n");
    print_matrix(a, n);
    printf("vector b:\n");
    print_vector(b, n);

    for (i = 0; i < total_processes; i++)
    {
        arg[i].a = a;
        arg[i].b = b;
        arg[i].n = n;
        arg[i].total_processes = total_processes;
        arg[i].num_process = i;
        arg[i].error = 0;
    }

    t = get_full_time ();
    for (i = 0; i < total_processes; i++)
    {
        if (pthread_create (tid + i, 0, &thread_method_of_reflections, arg + i))
        {
            printf ("Cannot create thread %d\n", i);
            // Buffers are deliberately not freed here: threads created so
            // far may still be using them, and returning from main ends
            // the process anyway.
            return 2;
        }
    }
    for (i = 0; i < total_processes; i++)
        pthread_join (tid[i], 0);
    back_hod(a, b, x, n);
    t = get_full_time () - t;
    // Print at most MAX_N components of the solution.
    N = (n < MAX_N) ? n : MAX_N;
    printf("result : ");
    for(i = 0; i < N; i++)
        printf("%lg ", x[i]);
    printvectorfile(x,n,name);
    // a and b were modified in place by the solver: restore the original
    // system before evaluating the residual.
    if(filename)
    {
        read_matrix(a, n, "a.txt");
        read_vector(b, n, "b.txt");
        printf("\nResidual = %le\nElapsed time = %Lg\n",SolutionError(n,a,b,x),(long double)t/(CLOCKS_PER_SEC));
    }
    else
    {
        init_matrix(a, n);
        init_vector(b, a, n);
        printf("\nResidual = %le\nError = %le\nElapsed time = %Lg\n",SolutionError(n,a,b,x), SolutionAccuracy(n,x),(long double)t/(CLOCKS_PER_SEC));
    }
    delete [] tid;
    delete [] arg;
    delete [] a;
    delete [] b;
    delete [] x;
    return 0;
}
예제 #15
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dgetrf_mgpu
*/
int main( int argc, char** argv )
{
    TESTING_INIT();

    real_Double_t    gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    double           error;
    double *h_A;
    magmaDouble_ptr d_lA[ MagmaMaxGPUs ];
    magma_int_t *ipiv;
    magma_int_t M, N, n2, lda, ldda, n_local, ngpu;
    magma_int_t info, min_mn, nb, ldn_local;
    magma_int_t status = 0;

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            nb     = magma_get_dgetrf_nb( M );
            gflops = FLOPS_DGETRF( M, N ) / 1e9;
            
            // ngpu must be at least the number of blocks
            ngpu = min( opts.ngpu, int((N+nb-1)/nb) );
            if ( ngpu < opts.ngpu ) {
                printf( " * too many GPUs for the matrix size, using %d GPUs\n", (int) ngpu );
            }
            
            // Allocate host memory for the matrix
            TESTING_MALLOC_CPU( ipiv, magma_int_t,        min_mn );
            TESTING_MALLOC_CPU( h_A,  double, n2     );
            
            // Allocate device memory
            for( int dev=0; dev < ngpu; dev++ ) {
                n_local = ((N/nb)/ngpu)*nb;
                if (dev < (N/nb) % ngpu)
                    n_local += nb;
                else if (dev == (N/nb) % ngpu)
                    n_local += N % nb;
                ldn_local = ((n_local+31)/32)*32;  // TODO why?
                magma_setdevice( dev );
                TESTING_MALLOC_DEV( d_lA[dev], double, ldda*ldn_local );
            }
    
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_dgetrf( &M, &N, h_A, &lda, ipiv, &info );
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_dgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            init_matrix( M, N, h_A, lda );
            magma_dsetmatrix_1D_col_bcyclic( M, N, h_A, lda, d_lA, ldda, ngpu, nb );
    
            gpu_time = magma_wtime();
            magma_dgetrf_mgpu( ngpu, M, N, d_lA, ldda, ipiv, &info );
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_dgetrf_mgpu returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
                       
            magma_dgetmatrix_1D_col_bcyclic( M, N, d_lA, ldda, h_A, lda, ngpu, nb );
    
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d  %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d    ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                printf( "     ---\n" );
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_CPU( h_A );
            for( int dev=0; dev < ngpu; dev++ ) {
                magma_setdevice( dev );
                TESTING_FREE_DEV( d_lA[dev] );
            }
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
예제 #16
0
파일: fem.c 프로젝트: marcoheisig/numpde05
/*
 * Runs one finite-element solve on a mesh generated from n and reports the
 * error of the computed solution against the reference function fn_u.
 *
 *   n       mesh parameter handed to get_mesh()
 *   errors  output: errors[0] = l2_norm(...), errors[1] = inf_norm(...)
 *   fn_f    function passed to get_local_load()
 *   fn_g    function passed to apply_dbc()
 *   fn_u    reference function passed to the norm routines
 *
 * Exits through err_exit() if either vector cannot be allocated.
 */
void fem(size_t n, double errors[2], double (*fn_f)(double, double),
         double (*fn_g)(unsigned char, double, double),
         double (*fn_u)(double, double))
{
  mesh m;
  crs_matrix mat;
  double *u;
  double *rhs;
  double local_stiffness[3][3];
  double local_load[3];
  size_t tri;
  size_t vec_bytes;

  /* Step 1: build the mesh. */
  get_mesh(&m, n);

#ifdef PRINT_DEBUG
  print_mesh(&m);
#endif

  /* Step 2: set up the linear system (matrix, solution, right-hand side).
     Both vectors hold one entry per mesh vertex and start at zero. */
  init_matrix(&mat, &m);
  vec_bytes = sizeof(double) * m.n_vertices;

  u = (double *) malloc(vec_bytes);
  if (u == NULL)
    err_exit("Allocation of solution vector failed!");
  memset(u, 0, vec_bytes);

  rhs = (double *) malloc(vec_bytes);
  if (rhs == NULL)
    err_exit("Allocation of right hand side failed!");
  memset(rhs, 0, vec_bytes);

  /* Step 3: assemble, one triangle at a time. */
  tri = 0;
  while (tri < m.n_triangles)
  {
    /* element-local contributions */
    get_local_stiffness(local_stiffness, &m, tri);
    get_local_load(local_load, &m, tri, fn_f);

#ifdef PRINT_DEBUG
    print_local_stiffness(local_stiffness);
    print_local_load(local_load);
#endif

    /* scatter them into the global matrix and right-hand side */
    assemble_local2global_stiffness(local_stiffness, &mat, &m, tri);
    assemble_local2global_load(local_load, rhs, &m, tri);

    ++tri;
  }

#ifdef PRINT_DEBUG
  printf("Matrix after assembly:\n");
  print_matrix(&mat);
  printf("rhs after assembly:\n");
  for (size_t v = 0; v < m.n_vertices; ++v)
    printf("%5.2f\n", rhs[v]);
  printf("\n");
#endif

  /* Step 4: boundary conditions. */
  apply_dbc(&mat, rhs, &m, fn_g);

#ifdef PRINT_DEBUG
  printf("Matrix after application of BCs:\n");
  print_matrix(&mat);
  printf("rhs after application of BCs:\n");
  for (size_t v = 0; v < m.n_vertices; ++v)
    printf("%5.2f\n", rhs[v]);
  printf("\n");
#endif

  /* Step 5: solve. */
  solve(&mat, u, rhs);

  /* Step 6: measure the error in both norms. */
  errors[0] = l2_norm(u, &m, fn_u);
  errors[1] = inf_norm(u, &m, fn_u);

  /* Release everything acquired above. */
  free(u);
  free(rhs);
  rhs = NULL;
  free_matrix(&mat);
  free_mesh(&m);
}
/* Frees a matrix made of `rows` separately allocated rows. NULL is accepted. */
static void free_int_matrix(int **matrix, int rows)
{
	int r;

	if (matrix == NULL) {
		return;
	}
	for (r = 0; r < rows; r++) {
		free(matrix[r]);
	}
	free(matrix);
}

/*
 * Allocates a zero-filled rows x cols matrix of int as an array of row
 * pointers. Returns NULL (with nothing left allocated) if any allocation
 * fails.
 */
static int **alloc_int_matrix(int rows, int cols)
{
	int r;
	int **matrix = (int **) calloc(rows, sizeof(int*));

	if (matrix == NULL) {
		return NULL;
	}
	for (r = 0; r < rows; r++) {
		matrix[r] = (int *) calloc(cols, sizeof(int));
		if (matrix[r] == NULL) {
			/* only rows 0..r-1 exist so far */
			free_int_matrix(matrix, r);
			return NULL;
		}
	}
	return matrix;
}

/*
 * Parses a strictly positive decimal dimension that fits in an int.
 * Returns 1 and stores the value in *out on success, 0 otherwise.
 */
static int parse_dimension(const char *text, int *out)
{
	char *end = NULL;
	long value = strtol(text, &end, 10);
	int narrowed = (int) value;

	/* end == text: no digits at all; the round trip catches values that
	   do not fit in an int. */
	if (end == text || value <= 0 || (long) narrowed != value) {
		return 0;
	}
	*out = narrowed;
	return 1;
}

/*
 * Main function.
 *
 * Usage: prog rows_matrix1 cols_matrix1 cols_matrix2
 *
 * Builds matrix1 (rows1 x cols1) and matrix2 (cols1 x cols2), fills them
 * with init_matrix(), and computes their product into matrixR. The result
 * is printed only when compiled with DEBUG.
 *
 * Returns 0 on success, -1 on missing/invalid arguments or allocation
 * failure.
 */
int main (int argc, char **argv) {

	int matrix1_fils, matrix1_cols, matrix2_fils, matrix2_cols;
	int **matrix1, **matrix2, **matrixR;
	int i, j, k, acum;

	if (argc <= 3) {
		fprintf(stderr, "Uso: %s filas_matriz1 columnas_matriz1 columnas_matriz2\n", argv[0]);
		return -1;
	}

	printf("\n%s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);

	if (!parse_dimension(argv[1], &matrix1_fils) ||
	    !parse_dimension(argv[2], &matrix1_cols) ||
	    !parse_dimension(argv[3], &matrix2_cols)) {
		fprintf(stderr, "Uso: %s filas_matriz1 columnas_matriz1 columnas_matriz2\n", argv[0]);
		return -1;
	}
	/* the inner dimensions must agree for the product to exist */
	matrix2_fils = matrix1_cols;

	/* Matrix allocation */
	matrix1 = alloc_int_matrix(matrix1_fils, matrix1_cols);
	matrix2 = alloc_int_matrix(matrix2_fils, matrix2_cols);
	matrixR = alloc_int_matrix(matrix1_fils, matrix2_cols);

	if (matrix1 == NULL || matrix2 == NULL || matrixR == NULL) {
		fprintf(stderr, "Error: no se pudo reservar memoria para las matrices\n");
		free_int_matrix(matrix1, matrix1_fils);
		free_int_matrix(matrix2, matrix2_fils);
		free_int_matrix(matrixR, matrix1_fils);
		return -1;
	}

	init_matrix(matrix1, matrix1_fils, matrix1_cols);
	init_matrix(matrix2, matrix2_fils, matrix2_cols);

	/* Main loop: matrixR = matrix1 * matrix2 */
	for (j = 0; j < matrix2_cols; j++) {
		for (i = 0; i < matrix1_fils; i++) {
			acum = 0;
			for (k = 0; k < matrix1_cols; k++) {
				acum += matrix1[i][k] * matrix2[k][j];
			}
			matrixR[i][j] = acum;
		}
	}

	#ifdef DEBUG
	print_matrix(matrixR, matrix1_fils, matrix2_cols);
	#endif

	/* Release the memory used */
	free_int_matrix(matrix1, matrix1_fils);
	free_int_matrix(matrix2, matrix2_fils);
	free_int_matrix(matrixR, matrix1_fils);

	return 0;
}
예제 #18
0
/*
 * Multi-threaded matrix test driver.
 *
 * Usage: prog n dim
 *   n    number of worker threads (1..MAX_THREAD)
 *   dim  matrix dimension, stored in the global NDIM
 *
 * Fills the global matrices a and b with a[i][j] = b[i][j] = i + j, starts
 * n threads running worker() on them, waits for all of them, calls
 * check_matrix() and prints the total runtime to stderr.
 *
 * Exits with status 1 on bad arguments, allocation failure or thread
 * creation failure; returns 0 otherwise.
 */
int main(int argc, char *argv[]) {

   blocking_entry();

   long long int start;
   long long int end;

   start = get_micro_clock();

   int j;

   pthread_t      *threads;
   pthread_attr_t  pthread_custom_attr;

   parm           *arg;
   int             n, i;

   if (argc != 3) {
      printf("Usage: %s n dim\n  where n is no. of thread and dim is the size of matrix\n", argv[0]);
      exit(1);
   }

   n = atoi(argv[1]);

   if ((n < 1) || (n > MAX_THREAD)) {
      printf("The no of thread should between 1 and %d.\n", MAX_THREAD);
      exit(1);
   }

   NDIM = atoi(argv[2]);

   /* atoi yields 0 for non-numeric text; a dimension below 1 is unusable */
   if (NDIM < 1) {
      printf("The matrix dimension should be at least 1.\n");
      exit(1);
   }

   pthread_mutex_init(&lock, NULL);

   init_matrix(&a);
   init_matrix(&b);
   init_matrix(&c);

   for (i = 0; i < NDIM; i++)
      for (j = 0; j < NDIM; j++)
      {
         a[i][j] = i + j;
         b[i][j] = i + j;
      }

   threads = (pthread_t*) malloc(n * sizeof(pthread_t));
   arg = (parm*) malloc(sizeof(parm) * n);
   if (threads == NULL || arg == NULL) {
      printf("Out of memory while setting up %d threads.\n", n);
      free(threads);
      free(arg);
      exit(1);
   }

   pthread_attr_init(&pthread_custom_attr);

   /* Spawn the workers; each one gets its own parm record. */
   for (i = 0; i < n; i++) {
      int rc;

      arg[i].id = i;
      arg[i].noproc = n;
      arg[i].dim = NDIM;
      arg[i].a = a;
      arg[i].b = b;
      arg[i].c = c;
      rc = pthread_create(&threads[i], &pthread_custom_attr, worker, (void*) (arg+i));
      if (rc != 0) {
         /* Joining an uninitialised handle later would be undefined;
            stop here instead. */
         printf("Failed to create thread %d (error %d).\n", i, rc);
         exit(1);
      }
   }

   /* Wait for every worker to finish. */
   for (i = 0; i < n; i++)
   {
      pthread_join(threads[i], NULL);

   }
   pthread_attr_destroy(&pthread_custom_attr);

   /* print_matrix(NDIM); */
   check_matrix(NDIM);
   free(arg);
   free(threads);

   end = get_micro_clock();
   fprintf(stderr, "> application runtime: %lld microseconds\n", end - start);

   return 0;
}
예제 #19
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgetrf

   For every problem size in opts (repeated opts.niter times): optionally
   times lapackf77_cgetrf, times magma_cgetrf on the same freshly
   initialised matrix, prints one result row, and, if opts.check is set,
   compares an error measure against the tolerance.
   Returns the number of checks that did not pass.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float          error;
    magmaFloatComplex *h_A;
    magma_int_t     *ipiv;
    magma_int_t     M, N, n2, lda, info, min_mn;
    magma_int_t     status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    float tol = opts.tolerance * lapackf77_slamch("E");

    // Table header; the last column depends on which check is requested.
    printf("ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[itest];
            N = opts.nsize[itest];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            gflops = FLOPS_CGETRF( M, N ) / 1e9;
            
            TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn );
            TESTING_MALLOC_PIN( h_A,  magmaFloatComplex, n2 );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_cgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // Re-initialise: the LAPACK run above overwrote h_A.
            init_matrix( M, N, h_A, lda );
            
            gpu_time = magma_wtime();
            magma_cgetrf( M, N, h_A, lda, ipiv, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cgetrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            // check == 2 selects the residual test, any other non-zero
            // value the LU reconstruction test.
            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e   %s\n", error, (error < tol ? "ok" : "failed"));
                status += ! (error < tol);
            }
            else {
                printf("     ---   \n");
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_PIN( h_A  );
            fflush( stdout );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
예제 #20
0
파일: main.c 프로젝트: ludcila/CFD-Lab
/*
 * Driver of the MPI-parallel flow solver.
 *
 * Takes the problem name from the first command line argument, reads
 * "<problem>.dat", prepares the output folder "<problem>/", sets up the
 * subdomain of this process and then advances the simulation in a time loop
 * until t exceeds t_end.
 *
 * Returns 0 on success and 1 if the problem name is missing or too long for
 * the fixed-size path buffers, or if iproc * jproc does not match the number
 * of MPI processes.
 */
int main(int argn, char** args){

	double **U, **V, **P, **F, **G, **RS;
	int **Flag;
	char problem[60];
	char parameters_filename[60];
	char pgm[60];
	char output_dirname[60];
	double Re, UI, VI, PI, GX, GY, t_end, xlength, ylength, dt, dx, dy, alpha, omg, tau, eps, dt_value, dp;
	double res = 0, t = 0, n = 0;
	int imax, jmax, itermax, it;
	int wl, wr, wt, wb;
	int timestepsPerPlotting;
	char old_output_filename[128];
	struct dirent *old_outputfile;
	DIR *output_dir;
	/* Number of characters a bounded snprintf() wanted to write */
	int written;
	/* Variables for parallel program */
	int iproc, jproc, myrank, il, ir, jb, jt, rank_l, rank_r, rank_b, rank_t, omg_i, omg_j, num_proc;
	double min_dt;
	double *bufSend, *bufRecv;
	double totalTime = 0;
	struct timespec previousTime, currentTime;
	
	MPI_Init(&argn, &args);
	MPI_Comm_size(MPI_COMM_WORLD, &num_proc);

	/* Read name of the problem from the command line arguments */
	if(argn > 1) {
		/* Bounded copy: an over-long argument must not overrun problem[] */
		written = snprintf(problem, sizeof(problem), "%s", args[1]);
		if(written < 0 || (size_t) written >= sizeof(problem)) {
			printf("\n=== ERROR: The problem name is too long (max. %d characters) ===\n\n", (int) sizeof(problem) - 1);
			MPI_Finalize();
			return 1;
		}
	} else {
		printf("\n=== ERROR: Please provide the name of the problem\n=== e.g. Run ./sim problem_name if there is a problem_name.dat file.\n\n");
		MPI_Finalize();
		return 1;
	}

	/* Generate input filename based on problem name */
	written = snprintf(parameters_filename, sizeof(parameters_filename), "%s.dat", problem);
	if(written < 0 || (size_t) written >= sizeof(parameters_filename)) {
		printf("\n=== ERROR: The parameter file name is too long ===\n\n");
		MPI_Finalize();
		return 1;
	}

	/* Read the program configuration file using read_parameters() */
	read_parameters(parameters_filename, pgm, &Re, &UI, &VI, &PI, &GX, &GY, &t_end, &xlength, &ylength, &dt, &dx, &dy, &imax, &jmax, &alpha, &omg, &tau, &itermax, &eps, &dt_value, problem, &dp, &wl, &wr, &wt, &wb, &timestepsPerPlotting, &iproc, &jproc);
	printf("%s\n", pgm);
	
	/* Check if the number of processes is correct */
	if(iproc * jproc != num_proc) {
		printf("\n=== ERROR: Number of processes is incorrect (iproc=%d, jproc=%d, -np=%d) ===\n\n", iproc, jproc, num_proc);
		MPI_Finalize();
		return 1;
	}

	/* Create folder with the name of the problem */
	/* The output prefix is "<problem>/<problem>", i.e. the name appears twice */
	written = snprintf(output_dirname, sizeof(output_dirname), "%s/%s", problem, problem);
	if(written < 0 || (size_t) written >= sizeof(output_dirname)) {
		printf("\n=== ERROR: The output path for problem \"%s\" is too long ===\n\n", problem);
		MPI_Finalize();
		return 1;
	}
	mkdir(problem, 0777);
	output_dir = opendir(problem);

	/* Delete existing files in output folder*/
	if(output_dir != NULL) {
		while((old_outputfile = readdir(output_dir))) {
			written = snprintf(old_output_filename, sizeof(old_output_filename), "%s/%s", problem, old_outputfile->d_name);
			/* Never remove a truncated path: it could name a different file */
			if(written >= 0 && (size_t) written < sizeof(old_output_filename)) {
				remove(old_output_filename);
			}
		}
	} else {
		/* Not fatal for the computation itself; old output is simply kept */
		printf("\n=== WARNING: Could not open output folder \"%s\" ===\n\n", problem);
	}
	
	/* Determine subdomain and neighbours for each process */
	init_parallel(iproc, jproc, imax, jmax, &myrank, &il, &ir, &jb, &jt, &rank_l, &rank_r, &rank_b, &rank_t, &omg_i, &omg_j, num_proc);

	/* Set up the matrices (arrays) needed using the matrix() command */
	U = matrix(il-2, ir+1, jb-1, jt+1);
	V = matrix(il-1, ir+1, jb-2, jt+1);
	P = matrix(il-1, ir+1, jb-1, jt+1);
	F = matrix(il-2, ir+1, jb-1, jt+1);
	G = matrix(il-1, ir+1, jb-2, jt+1);
	RS= matrix(il, ir, jb, jt);
	Flag = imatrix(il-1, ir+1, jb-1, jt+1);
	
	/* Assign initial values to u, v, p */
	init_uvp(UI, VI, PI, il, ir, jb, jt, U, V, P);
	
	/* Allocate memory for buffers */
	bufSend = malloc(max(ir-il+3, jt-jb+3) * sizeof(double));
	bufRecv = malloc(max(ir-il+3, jt-jb+3) * sizeof(double));
	
	/* Initialize lower part of the domain with UI = 0 for the flow_over_step problem */
	/* (this code might be moved to somewhere else later) */
	if(strcmp(problem, "flow_over_step") == 0) {
		init_matrix(U, il, ir, jb, min(jmax/2, jt), 0);
	}

	/* Initialization of flag field */
	init_flag(pgm, imax, jmax, il, ir, jb, jt, Flag, dp);
	
	/* Only rank 0 reports runtimes, so only rank 0 needs the start time */
	if(myrank == 0) {
		clock_gettime(CLOCK_MONOTONIC, &currentTime);
	}

	while(t <= t_end){
	
		/* Select δt */
		calculate_dt(Re, tau, &dt, dx, dy, il, ir, jb, jt, U, V);
		/* All processes advance with the smallest local time step */
		MPI_Allreduce(&dt, &min_dt, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

		dt = min_dt;
		
		/* Set boundary values for u and v */
		boundaryvalues(il, ir, jb, jt, imax, jmax, U, V, wl, wr, wt, wb, Flag);
	
		/* Set special boundary values */
		spec_boundary_val(problem, il, ir, jb, jt, imax, jmax, U, V, P, Re, xlength, ylength, dp);

		/* Compute F(n) and G(n) */
		calculate_fg(Re, GX, GY, alpha, dt, dx, dy, il, ir, jb, jt, imax, jmax, U, V, F, G, Flag);
		
		/* Compute the right-hand side rs of the pressure equation */
		calculate_rs(dt, dx, dy, il, ir, jb, jt, imax, jmax, F, G, RS);
		
		/* Perform SOR iterations */
		it = 0;
		res = 1e6;
		while(it < itermax && res > eps){
			sor(omg, dx, dy, dp, il, ir, jb, jt, imax, jmax, rank_l, rank_r, rank_b, rank_t, P, RS, &res, Flag, bufSend, bufRecv);
			it++;
		}
		
		/* Compute u(n+1) and v(n+1) */
		calculate_uv(dt, dx, dy, il, ir, jb, jt, imax, jmax, U, V, F, G, P, Flag);
		
		/* Exchange velocity strips */
		uv_com(U, V, il, ir, jb, jt, rank_l, rank_r, rank_b, rank_t, bufSend, bufRecv);
		
		t = t + dt;
		n++;
		
		/* Generate snapshot for current timestep */
		if((int) n % timestepsPerPlotting == 0) {
			write_vtkFile(output_dirname, myrank, n, xlength, ylength, il, ir, jb, jt, imax, jmax, dx, dy, U, V, P);
		}
		
		/* Print out simulation time and whether the SOR converged */
		if(myrank == 0) {
			/* Print simulation time */
			printf("Time: %.4f", t);
			/* Print runtime */
			previousTime = currentTime;
			clock_gettime(CLOCK_MONOTONIC, &currentTime);
			totalTime += (double)currentTime.tv_sec + 1e-9 * currentTime.tv_nsec - (double)previousTime.tv_sec - 1e-9 * previousTime.tv_nsec;
			printf("\tRuntime: %.3f s (avg runtime/step: %.3f s)", totalTime, totalTime/n);
			if(res > eps) printf("\tDid not converge (res=%f, eps=%f)", res, eps);
			printf("\n");
		}

	}
	
	/* Close the output folder */
	if(output_dir != NULL) {
		closedir(output_dir);
	}
	
	/* Tell user where to find the output */
	if(myrank == 0) {
		printf("Please find the output in the folder \"%s\".\n", problem);
	}
	
	/* Free allocated memory */
	free_matrix(U, il-2, ir+1, jb-1, jt+1);
	free_matrix(V, il-1, ir+1, jb-2, jt+1);
	free_matrix(P, il-1, ir+1, jb-1, jt+1);
	free_matrix(F, il-2, ir+1, jb-1, jt+1);
	free_matrix(G, il-1, ir+1, jb-2, jt+1);
	free_matrix(RS, il, ir, jb, jt);
	free_imatrix(Flag, il-1, ir+1, jb-1, jt+1);
	free(bufSend);
	free(bufRecv);
	
	MPI_Barrier(MPI_COMM_WORLD);
	MPI_Finalize();
	
	return 0;
	
}
예제 #21
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing cgetrf
*/
int main( int argc, char** argv)
{
    real_Double_t   gflops, gpu_perf, gpu_time, cpu_perf=0, cpu_time=0;
    float          error;
    magmaFloatComplex *h_A;
    magma_int_t     *ipiv;
    magma_int_t     M, N, n2, lda, ldda, info, min_mn;
    magma_int_t     status = 0;

    /* Initialize */
    magma_queue_t  queue[2];
    magma_device_t devices[MagmaMaxGPUs];
    int num = 0;
    magma_err_t err;
    magma_init();
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    float tol = opts.tolerance * lapackf77_slamch("E");

    err = magma_get_devices( devices, MagmaMaxGPUs, &num );
    if ( err != 0 || num < 1 ) {
      fprintf( stderr, "magma_get_devices failed: %d\n", err );
      exit(-1);
    }

    // Create two queues on device opts.device
    err = magma_queue_create( devices[opts.device], &queue[0] );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }
    err = magma_queue_create( devices[opts.device], &queue[1] );
    if ( err != 0 ) {
      fprintf( stderr, "magma_queue_create failed: %d\n", err );
      exit(-1);
    }

    printf("ngpu %d\n", (int) opts.ngpu );
    if ( opts.check == 2 ) {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    }
    else {
        printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |PA-LU|/(N*|A|)\n");
    }
    printf("=========================================================================\n");
    for( int i = 0; i < opts.ntest; ++i ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            M = opts.msize[i];
            N = opts.nsize[i];
            min_mn = min(M, N);
            lda    = M;
            n2     = lda*N;
            ldda   = ((M+31)/32)*32;
            gflops = FLOPS_CGETRF( M, N ) / 1e9;
            
            TESTING_MALLOC_CPU( ipiv, magma_int_t, min_mn );
            TESTING_MALLOC_PIN( h_A,  magmaFloatComplex, n2 );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                init_matrix( M, N, h_A, lda );
                
                cpu_time = magma_wtime();
                lapackf77_cgetrf(&M, &N, h_A, &lda, ipiv, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_cgetrf returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
            }
            
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            init_matrix( M, N, h_A, lda );
            
            gpu_time = magma_wtime();
            magma_cgetrf( M, N, h_A, lda, ipiv, &info, queue);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_cgetrf returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) M, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)",
                       (int) M, (int) N, gpu_perf, gpu_time );
            }
            if ( opts.check == 2 ) {
                error = get_residual( M, N, h_A, lda, ipiv );
                printf("   %8.2e%s\n", error, (error < tol ? "" : "  failed"));
                status |= ! (error < tol);
            }
            else if ( opts.check ) {
                error = get_LU_error( M, N, h_A, lda, ipiv );
                printf("   %8.2e%s\n", error, (error < tol ? "" : "  failed"));
                status |= ! (error < tol);
            }
            else {
                printf("     ---   \n");
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_PIN( h_A );
        }
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    magma_queue_destroy( queue[0] );
    magma_queue_destroy( queue[1] );
    magma_finalize();

    return status;
}
/**** WEKA specific functions *********/
/* Builds a labeled two-row matrix of per-sample prediction outcomes for one
 * split/fold from the WEKA result files found under modelDir.
 *
 * Row 0 ("trainingAccuracies") is filled from weka.training.results and row 1
 * ("testingAccuracies") from weka.validation.results. The matrix has one
 * column per column of the combined training + validation metadata. For each
 * result line after the "inst#" header: lines containing ":?" leave the cell
 * as init_matrix() set it, otherwise the cell becomes 0 when the line
 * contains " + " and 1 when it does not.
 *
 * Aborts through errAbort() if either result file cannot be opened.
 * The caller owns the returned matrix.
 * NOTE(review): ownership of the strings returned by readLine() is not
 * visible here - confirm whether each line has to be freed. */
matrix * WEKApopulateAccuracyMatrix(struct hash * config, int split, int fold)
{
char * trainingDir = hashMustFindVal(config, "trainingDir");
char * validationDir = hashMustFindVal(config, "validationDir");
char * modelDir = hashMustFindVal(config, "modelDir");
char filename[256];

//concatenate the training and validation metadata; training columns come first
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", trainingDir, split, fold);
matrix * trMetadata = WEKAtoMetadataMatrix(filename);
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/data.arff", validationDir, split, fold);
matrix * valMetadata = WEKAtoMetadataMatrix(filename);
matrix * metadata = append_matrices(trMetadata, valMetadata, 1);
struct slInt * trainingList = list_indices(trMetadata->cols);

//create a labeled matrix for results to be stored in
matrix * result = init_matrix(2, metadata->cols);
safef(result->rowLabels[0], MAX_LABEL, "trainingAccuracies");
safef(result->rowLabels[1], MAX_LABEL, "testingAccuracies");
copy_matrix_labels(result, metadata, 2,2);
result->labels=1;

//read the training results from file
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.training.results", modelDir, split, fold);
FILE * fp = fopen(filename, "r");
if(fp == NULL)
	errAbort("Couldn't open %s for reading.", filename);
//advance the cursor to where data starts
char * line;
while((line = readLine(fp)) != NULL)
	{
	if(strstr(line, "inst#") != NULL)
		break;
	}
//read each result and save to results matrix
int i;
for(i = 0; i < trMetadata->cols && (line = readLine(fp)) != NULL; i++)
	{
	if(strstr(line, ":?") == NULL)
		{
		if(strstr(line, " + ") == NULL)
			result->graph[0][i] = 1;
		else
			result->graph[0][i] = 0;
		}
	}
//done with the training results; do not leak the handle
fclose(fp);

//read the validation results from file
safef(filename, sizeof(filename), "%s/split%02d/fold%02d/weka.validation.results", modelDir, split, fold);
fp = fopen(filename, "r");
if(fp == NULL)
	errAbort("Couldn't open %s for reading.", filename);
//advance the cursor to where data starts
while((line = readLine(fp)) != NULL)
	{
	if(strstr(line, "inst#") != NULL)
		break;
	}
//validation samples occupy the columns after the training ones; start there
//explicitly so a short training file cannot shift them into training columns
for(i = trMetadata->cols; i < result->cols && (line = readLine(fp)) != NULL; i++)
	{
	if(strstr(line, ":?") == NULL)
		{
		if(strstr(line, " + ") == NULL)
			result->graph[1][i] = 1;
		else
			result->graph[1][i] = 0;
		}
	}
fclose(fp);

free_matrix(trMetadata);
free_matrix(valMetadata);
free_matrix(metadata);
slFreeList(&trainingList);

return result;
}
예제 #23
0
파일: json.c 프로젝트: GeorgeShaw/opensips
/*
 * Parses the name part of a json pseudo-variable and attaches the result
 * to the given spec.
 *
 * The input is scanned one character at a time through the state machine
 * held in the next[][] / ignore[][] tables (built on first use by
 * init_matrix()). Each time the state changes, the text collected for the
 * state being left is handed to get_value().
 *
 * sp - spec that receives the parsed name and the json get/set callbacks
 * in - text to parse
 *
 * Returns 0 on success and -1 on allocation failure, on an unexpected
 * character, when the input ends in state ST_IDX, or when get_value()
 * fails.
 *
 * NOTE(review): the error paths return without releasing `id` - confirm
 * whether this leak matters to callers and how collected tags must be freed.
 */
int pv_parse_json_name (pv_spec_p sp, str *in)
{
	json_name * id;
	char * cur,* start;
	int state,next_state,prev_state;
	unsigned char c;

	if( !inited )
		init_matrix();


	id = (json_name *) pkg_malloc(sizeof(json_name));
	if( id == NULL )
	{
		LM_ERR("Out of memory\n");
		return -1;
	}

	id->tags = NULL;
	id->end = &id->tags;


	state = ST_NAME;
	start = in->s;
	prev_state = -1;

	for( cur = in->s; cur < in->s + in->len; cur++)
	{
		/* Index the tables with the byte value. A plain char may be
		 * negative, and casting it straight to unsigned int would
		 * sign-extend it into a huge out-of-range index. */
		c = (unsigned char)*cur;

		next_state = next[state][c];

		if( next_state == ST_ERR)
		{
			LM_ERR("Unexpected char at position: %d in :(%.*s)\n",
				(int)(cur-in->s),in->len,in->s);
			return -1;
		}

		/* first character seen in the current state */
		if( state != prev_state)
			start = cur;

		/* leaving the current state: hand over what was collected */
		if( state != next_state)
			if ( get_value(state, id, start, cur) )
				return -1;


		/* step back so the loop increment visits this character again */
		if( ignore[state][c])
		{
			cur --;
		}

		prev_state = state;
		state = next_state;

	}

	if( state == ST_IDX)
	{
		LM_ERR("Mismatched paranthesis in:(%.*s)\n",in->len,in->s);
		return -1;
	}


	/* hand over whatever the final state collected */
	if( get_value(state, id, start, cur) )
		return -1;


	sp->pvp.pvn.u.dname = id ;
	sp->type = PV_JSON_ID;
	sp->getf = pv_get_json;
	sp->setf = pv_set_json;


	return 0;
}
예제 #24
0
/*
 * Server side of a networked Tic Tac Toe game.
 *
 * Binds a TCP socket to 127.0.0.1 on the port given as first argument,
 * accepts one client and then alternates client and local moves until
 * check() reports something other than ' '.
 */
int main(int argc, char *argv[])
{
     struct sockaddr_in serv_addr, cli_addr;
     socklen_t clilen;
     int portno;
     char done = ' ';

     /* the port number is mandatory */
     if (argc < 2) {
         fprintf(stderr,"ERROR, no port provided\n");
         exit(1);
     }

     sockfd = socket(AF_INET, SOCK_STREAM, 0);
     if (sockfd < 0)
         error("ERROR opening socket");

     /* describe the local endpoint: loopback address, requested port */
     bzero((char *) &serv_addr, sizeof(serv_addr));
     portno = atoi(argv[1]);
     serv_addr.sin_family = AF_INET;
     serv_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
     serv_addr.sin_port = htons(portno);

     if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0)
         error("ERROR on binding");

     listen(sockfd, 5);
     clilen = sizeof(cli_addr);

     puts("This is the game of Tic Tac Toe.\n");
     puts("waiting connection to established....\n");

     /* wait for the opponent to connect */
     newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen);
     if (newsockfd < 0)
         error("ERROR on accept");

     init_matrix();

     /* one round = client move followed by local move; stop at the first result */
     for (;;) {
         disp_matrix();
         puts("waiting for client move\n");
         get_client_move();

         disp_matrix();
         done = check();
         if (done != ' ')
             break;

         get_player_move();

         disp_matrix();
         done = check();
         if (done != ' ')
             break;
     }

     if (done != 'X')
         printf("Player O won!!!!\n");
     else
         printf("Player X won!\n");
     disp_matrix(); /* final board */

     close(newsockfd);
     close(sockfd);
     return 0;
}
예제 #25
0
/*
 * MPI matrix multiplication c = a * b on SIZE x SIZE matrices.
 *
 * Rank 0 initializes the matrices, sends every other rank one block of rows
 * of a and one block of columns of b, computes its own block of c and then
 * collects the remaining blocks. Supported process counts:
 *   1 - rank 0 computes everything,
 *   2 - c is split into two blocks of SIZE/2 rows,
 *   4 - c is split into four SIZE/2 x SIZE/2 blocks.
 * Any other process count is rejected, because no block size is defined
 * for it.
 *
 * Returns 0 on success, 1 for an unsupported process count.
 */
int
main(int argc, char **argv)
{
    int myrank, nproc;
    int rows, columns; /* amount of work per node (block of c: rows x columns) */
    int mtype; /* message type: send/recv between master and workers */
    int dest, src, offsetrow, offsetcolumn;
    double start_time, end_time;
    int i, j, k;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    /* Block size per process */
    switch (nproc) {
    case 1:
        rows = SIZE;
        columns = SIZE;
        break;
    case 2:
        rows = SIZE / 2;
        columns = SIZE;
        break;
    case 4:
        rows = SIZE / 2;
        columns = SIZE / 2;
        break;
    default:
        /* rows/columns would stay uninitialized; refuse to run */
        if (myrank == 0)
            printf("ERROR: unsupported number of processes (%d), use 1, 2 or 4\n", nproc);
        MPI_Finalize();
        return 1;
    }

    /* rowtype: `rows` complete rows of a; columntype: `columns` adjacent
       columns of b over all SIZE rows; resulttype: one rows x columns block of c */
    MPI_Type_contiguous(SIZE*rows,MPI_DOUBLE,&rowtype);
    MPI_Type_commit(&rowtype);
    MPI_Type_vector(SIZE,columns,SIZE,MPI_DOUBLE,&columntype);
    MPI_Type_commit(&columntype);
    MPI_Type_vector(rows,columns,SIZE,MPI_DOUBLE,&resulttype);
    MPI_Type_commit(&resulttype);

    if (myrank == 0) {
        /* Master task */

        /* Initialization */
        init_matrix();
        start_time = MPI_Wtime();

        /* Send each worker the rows of a and the columns of b for its block */
        mtype = FROM_MASTER;
        offsetrow = 0;
        offsetcolumn = 0;
        for (dest = 1; dest < nproc; dest++) {
            if (DEBUG)
                printf("   sending %d rows and %d columns to task %d\n",rows,columns,dest);
            /* The row offset alternates between the two halves; the column
               offset moves to the second half from worker 2 on */
            offsetrow = (offsetrow+rows)%SIZE;
            if (dest == 2) {
                offsetcolumn = (offsetcolumn+columns)%SIZE;
            }

            MPI_Send(&offsetrow, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
            MPI_Send(&offsetcolumn, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
            MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
            MPI_Send(&columns, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
            MPI_Send(&a[offsetrow][0], 1, rowtype, dest, mtype, MPI_COMM_WORLD);
            MPI_Send(&b[0][offsetcolumn], 1, columntype, dest, mtype, MPI_COMM_WORLD);
        }
        printf(" ---- SEND ----- Execution time on %2d nodes: %5.2f\n", myrank, MPI_Wtime()-start_time);

        /* let master do its part of the work: the block at offset (0, 0) */
        for (i = 0; i < rows; i++) {
            for (j = 0; j < columns; j++) {
                c[i][j] = 0;
                for (k = 0; k < SIZE; k++)
                {
                    c[i][j] += a[i][k] * b[k][j];
                }
            }
        }
        printf("---- algo ----- Execution time on %2d nodes: %5.2f\n",  myrank, MPI_Wtime()-start_time);

        /* collect the results from all the workers */
        mtype = FROM_WORKER;
        for (src = 1; src < nproc; src++) {
            MPI_Recv(&offsetrow, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status);
            MPI_Recv(&offsetcolumn, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status);
            MPI_Recv(&rows, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status);
            MPI_Recv(&columns, 1, MPI_INT, src, mtype, MPI_COMM_WORLD, &status);
            MPI_Recv(&c[offsetrow][offsetcolumn], 1, resulttype, src, mtype, MPI_COMM_WORLD, &status);
            if (DEBUG)
                printf("   recvd %d rows and %d columns from task %d, offsetrow = %d, offsetcolumn = %d\n",
                       rows, columns, src, offsetrow, offsetcolumn);
        }
        printf(" ---- RECV ----- Execution time on %2d nodes: %5.2f\n", myrank, MPI_Wtime()-start_time);
        end_time = MPI_Wtime();

        printf("Execution time on %2d nodes: %f\n", nproc, end_time-start_time);
        //if (DEBUG)
            /* Prints the resulting matrix c */
            //print_matrix();
    } else {
        /* Worker tasks */

        /* Receive data from master */
        mtype = FROM_MASTER;
        MPI_Recv(&offsetrow, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&offsetcolumn, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&rows, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&columns, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD, &status);

        MPI_Recv(&a[offsetrow][0],1, rowtype, 0, mtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&b[0][offsetcolumn], 1, columntype, 0, mtype, MPI_COMM_WORLD, &status);
        if (DEBUG)
            printf ("Rank=%d, offsetrow=%d,offsetcolumn=%d, row =%d, column=%d, a[offsetrow][0]=%e, b[0][offsetcolumn]=%e\n",
                myrank, offsetrow, offsetcolumn, rows, columns,  a[offsetrow][0], b[0][offsetcolumn]);

        /* do the workers part of the calculation */
        for (i=offsetrow; i<offsetrow+rows; i++) {
            for (j=offsetcolumn; j<offsetcolumn+columns; j++) {
                c[i][j] = 0.0;
                for (k=0; k<SIZE; k++){
                    c[i][j] = c[i][j] + a[i][k] * b[k][j];
                }
            }
        }
        /* Report the first element of the block this worker just computed
           (the label used to say c[...] while the value printed was a[...]) */
        if (DEBUG)
            printf ("Rank=%d, offsetrow=%d,  offsetcolumn=%d,row =%d, column=%d, c[offsetrow][offsetcolumn]=%e\n",
                myrank, offsetrow,offsetcolumn, rows, columns, c[offsetrow][offsetcolumn]);

        /* send the results to the master */
        mtype = FROM_WORKER;
        MPI_Send(&offsetrow, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD);
        MPI_Send(&offsetcolumn, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD);
        MPI_Send(&rows, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD);
        MPI_Send(&columns, 1, MPI_INT, 0, mtype, MPI_COMM_WORLD);
        MPI_Send(&c[offsetrow][offsetcolumn], 1, resulttype, 0, mtype, MPI_COMM_WORLD);

    }

    MPI_Finalize();
    return 0;
}
예제 #26
0
파일: test.c 프로젝트: cirqueit/mxp
int main(void)
{
	pixel *input;
	pixel *scalar_input;

#if USE_LUMA
	unsigned char  *vbx_luma;
#endif
	unsigned short *scalar_luma;

	pixel *vbx_output;
	pixel *scalar_output;

	vbx_timestamp_t time_start, time_stop;
	double scalar_time, vbx_time;
	int x, y;
	int errors = 0;

	vbx_test_init();

	vbx_mxp_print_params();

	input         = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	scalar_input  = (pixel *)vbx_remap_cached(input, IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
#if USE_LUMA
	vbx_luma      = (unsigned char *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned char));
#endif
	scalar_luma   = (unsigned short *)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(unsigned short));
	vbx_output    = (pixel *)vbx_shared_malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));
	scalar_output = (pixel *)malloc(IMAGE_WIDTH*IMAGE_HEIGHT*sizeof(pixel));

	printf("\nInitializing data\n");
	printf("Resolution = %dx%d\n", IMAGE_WIDTH, IMAGE_HEIGHT);
	init_matrix(input, IMAGE_WIDTH, IMAGE_HEIGHT);

	printf("Starting Sobel 3x3 edge-detection test\n");

#if USE_LUMA
	scalar_rgb2luma(scalar_luma, scalar_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH);
#endif
	vbx_timestamp_start();
	time_start = vbx_timestamp();
#if !USE_LUMA
	scalar_rgb2luma(scalar_luma, scalar_input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH);
#endif
	scalar_sobel_argb32_3x3(scalar_output, scalar_luma, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH, RENORM_AMOUNT);
	time_stop = vbx_timestamp();
	scalar_time = vbx_print_scalar_time(time_start, time_stop);

#if USE_LUMA
	vbw_rgb2luma8(vbx_luma, (unsigned *)input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH);
#endif
	vbx_timestamp_start();
	time_start = vbx_timestamp();
#if USE_LUMA
	vbw_sobel_luma8_3x3((unsigned *)vbx_output, vbx_luma, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH, RENORM_AMOUNT);
#else
	vbw_sobel_argb32_3x3((unsigned *)vbx_output, (unsigned *)input, IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_PITCH, RENORM_AMOUNT);
#endif
	time_stop = vbx_timestamp();
	vbx_time = vbx_print_vector_time(time_start, time_stop, scalar_time);

	for (y = 0; y < IMAGE_HEIGHT; y++) {
		for (x = 0; x < IMAGE_WIDTH; x++) {
#if USE_LUMA
			if (scalar_luma[y*IMAGE_WIDTH+x] != vbx_luma[y*IMAGE_WIDTH+x]) {
				if (errors < MAX_PRINT_ERRORS) {
					printf("Y Error at %d, %d: Expected = %02X, got = %02X\n",
							y, x, scalar_luma[y*IMAGE_WIDTH+x], vbx_luma[y*IMAGE_WIDTH+x]);
				}
				errors++;
			}
#endif
			if (scalar_output[y*IMAGE_WIDTH+x].r != vbx_output[y*IMAGE_WIDTH+x].r) {
				if (errors < MAX_PRINT_ERRORS) {
					printf("R Error at %d, %d: Expected = %02X, got = %02X\n",
							y, x, scalar_output[y*IMAGE_WIDTH+x].r, vbx_output[y*IMAGE_WIDTH+x].r);
				}
				errors++;
			}
			if (scalar_output[y*IMAGE_WIDTH+x].g != vbx_output[y*IMAGE_WIDTH+x].g) {
				if (errors < MAX_PRINT_ERRORS) {
					printf("G Error at %d, %d: Expected = %02X, got = %02X\n",
							y, x, scalar_output[y*IMAGE_WIDTH+x].g, vbx_output[y*IMAGE_WIDTH+x].g);
				}
				errors++;
			}
			if (scalar_output[y*IMAGE_WIDTH+x].b != vbx_output[y*IMAGE_WIDTH+x].b) {
				if (errors < MAX_PRINT_ERRORS) {
					printf("B Error at %d, %d: Expected = %02X, got = %02X\n",
							y, x, scalar_output[y*IMAGE_WIDTH+x].b, vbx_output[y*IMAGE_WIDTH+x].b);
				}
				errors++;
			}
		}
	}

	VBX_TEST_END(errors);
	return errors;
}
예제 #27
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dsysv

   For every size in opts.nsize (repeated opts.niter times) this solves a
   system with magma_dsysv and, when opts.lapack is set, also with
   lapackf77_dsysv, printing GFlop/s and time for each. Unless opts.check
   is 0 the MAGMA solution is checked with get_residual() against tol.
   Returns the number of checked runs whose residual was not below tol.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    double *h_A, *h_B, *h_X, *work, temp;
    real_Double_t   gflops, gpu_perf, gpu_time = 0.0, cpu_perf=0, cpu_time=0;
    double          error, error_lapack = 0.0;
    magma_int_t     *ipiv;
    magma_int_t     N, n2, lda, ldb, sizeB, lwork, info;
    magma_int_t     status = 0, ione = 1;
    // Seed passed to lapackf77_dlarnv for the right-hand sides
    magma_int_t     ISEED[4] = {0,0,0,1};

    magma_opts opts;
    parse_opts( argc, argv, &opts );
    
    // Pass/fail threshold: user tolerance scaled by lapackf77_dlamch("E")
    double tol = opts.tolerance * lapackf77_dlamch("E");

    printf("    M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)   |Ax-b|/(N*|A|*|x|)\n");
    printf("=========================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // Square N x N system with opts.nrhs right-hand sides
            N = opts.nsize[itest];
            ldb    = N;
            lda    = N;
            n2     = lda*N;
            sizeB  = ldb*opts.nrhs;
            // The flop count is taken from the DPOTRF/DPOTRS macros
            gflops = ( FLOPS_DPOTRF( N ) + FLOPS_DPOTRS( N, opts.nrhs ) ) / 1e9;
            
            TESTING_MALLOC_CPU( ipiv, magma_int_t, N );
            TESTING_MALLOC_PIN( h_A,  double, n2 );
            TESTING_MALLOC_PIN( h_B,  double, sizeB );
            TESTING_MALLOC_PIN( h_X,  double, sizeB );
            
            /* =====================================================================
               Performs operation using LAPACK
               =================================================================== */
            if ( opts.lapack ) {
                // First call with lwork = -1; the workspace size is then
                // read back from temp and used to allocate work
                lwork = -1;
                lapackf77_dsysv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs, 
                                h_A, &lda, ipiv, h_X, &ldb, &temp, &lwork, &info);
                lwork = (int)MAGMA_D_REAL(temp);
                TESTING_MALLOC_CPU( work, double, lwork );

                // Fill A, generate B with dlarnv and copy B into X; X is the
                // array handed to the solver, B is kept for the residual
                init_matrix( N, N, h_A, lda );
                lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B );
                lapackf77_dlacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb );

                cpu_time = magma_wtime();
                lapackf77_dsysv(lapack_uplo_const(opts.uplo), &N, &opts.nrhs,
                                h_A, &lda, ipiv, h_X, &ldb, work, &lwork, &info);
                cpu_time = magma_wtime() - cpu_time;
                cpu_perf = gflops / cpu_time;
                if (info != 0)
                    printf("lapackf77_dsysv returned error %d: %s.\n",
                           (int) info, magma_strerror( info ));
                // Kept only for the informational "lapack rel.res." output below
                error_lapack = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb );

                TESTING_FREE_CPU( work );
            }
           
            /* ====================================================================
               Performs operation using MAGMA
               =================================================================== */
            // Same setup as for the LAPACK run: fill A, generate B, copy B to X
            init_matrix( N, N, h_A, lda );
            lapackf77_dlarnv( &ione, ISEED, &sizeB, h_B );
            lapackf77_dlacpy( MagmaUpperLowerStr, &N, &opts.nrhs, h_B, &ldb, h_X, &ldb );

            magma_setdevice(0);
            gpu_time = magma_wtime();
            magma_dsysv( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, &info);
            gpu_time = magma_wtime() - gpu_time;
            gpu_perf = gflops / gpu_time;
            if (info != 0)
                printf("magma_dsysv returned error %d: %s.\n",
                       (int) info, magma_strerror( info ));
            
            /* =====================================================================
               Check the factorization
               =================================================================== */
            if ( opts.lapack ) {
                printf("%5d %5d   %7.2f (%7.2f)   %7.2f (%7.2f)",
                       (int) N, (int) N, cpu_perf, cpu_time, gpu_perf, gpu_time );
            }
            else {
                printf("%5d %5d     ---   (  ---  )   %7.2f (%7.2f)",
                       (int) N, (int) N, gpu_perf, gpu_time );
            }
            if ( opts.check == 0 ) {
                printf("     ---   \n");
            } else {
                error = get_residual( opts.uplo, N, opts.nrhs, h_A, lda, ipiv, h_X, ldb, h_B, ldb );
                printf("   %8.2e   %s", error, (error < tol ? "ok" : "failed"));
                if (opts.lapack)
                    printf(" (lapack rel.res. = %8.2e)", error_lapack);
                printf("\n");
                // "! (error < tol)" also counts a NaN residual as a failure
                status += ! (error < tol);
            }
            
            TESTING_FREE_CPU( ipiv );
            TESTING_FREE_PIN( h_X  );
            TESTING_FREE_PIN( h_B  );
            TESTING_FREE_PIN( h_A  );
            fflush( stdout );
        }
        // Blank line between sizes when each size is run several times
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }

    TESTING_FINALIZE();
    return status;
}
예제 #28
0
/* ////////////////////////////////////////////////////////////////////////////
   -- Testing dswap, dswapblk, dpermute, dlaswp, dlaswpx

   For every matrix size in opts.nsize, and opts.niter times per size, this
   driver:
     1. builds a list of nb random 1-based pivot indices in [1, N],
     2. applies the corresponding row/column swaps on the GPU with each
        routine under test, timing each one,
     3. applies the same swaps on the CPU (blasf77_dswap / lapackf77_dlaswp)
        and compares the GPU result against it with diff_matrix,
     4. prints one table row of bandwidths (GByte/s); a '*' after a number
        marks a routine whose result differed from the CPU reference.

   Returns the number of table rows in which at least one check failed.
*/
int main( int argc, char** argv)
{
    TESTING_INIT();

    double *h_A1, *h_A2;   // host matrices the CPU reference swaps are applied to
    double *d_A1, *d_A2;   // device matrices the GPU routines operate on
    double *h_R1, *h_R2;   // host copies of the device results, for comparison
    
    // row-major and column-major performance
    // (index = routine: 0 cublasDswap, 1 magmablas_dswap, 2 dswapblk,
    //  3 dpermute_long2, 4 dlaswp, 5 dlaswpx, 6 dcopymatrix, 7 dlaswp2)
    real_Double_t row_perf0, col_perf0;
    real_Double_t row_perf1, col_perf1;
    real_Double_t row_perf2, col_perf2;
    real_Double_t row_perf3;
    real_Double_t row_perf4;
    real_Double_t row_perf5, col_perf5;
    real_Double_t row_perf6, col_perf6;
    real_Double_t row_perf7;
    real_Double_t cpu_perf;

    real_Double_t time, gbytes;

    magma_int_t N, lda, ldda, nb, j;
    magma_int_t ione = 1;
    magma_int_t *ipiv, *ipiv2;
    magma_int_t *d_ipiv;
    magma_int_t status = 0;
    
    magma_opts opts;
    parse_opts( argc, argv, &opts );

    magma_queue_t queue = 0;
    
    printf("            cublasDswap       dswap             dswapblk          dlaswp   dpermute dlaswp2  dlaswpx           dcopymatrix      CPU      (all in )\n");
    printf("    N   nb  row-maj/col-maj   row-maj/col-maj   row-maj/col-maj   row-maj  row-maj  row-maj  row-maj/col-maj   row-blk/col-blk  dlaswp   (GByte/s)\n");
    printf("==================================================================================================================================================\n");
    for( int itest = 0; itest < opts.ntest; ++itest ) {
        for( int iter = 0; iter < opts.niter; ++iter ) {
            // For an N x N matrix, swap nb rows or nb columns using various methods.
            // Each test is assigned one bit in the 'check' bitmask; bit=1 indicates failure.
            // The variable 'shift' keeps track of which bit is for current test
            int shift = 1;
            int check = 0;
            N = opts.nsize[itest];
            lda    = N;
            ldda   = ((N+31)/32)*32;   // device leading dimension: N rounded up to a multiple of 32
            nb     = (opts.nb > 0 ? opts.nb : magma_get_dgetrf_nb( N ));
            nb     = min( N, nb );
            // each swap does 2N loads and 2N stores, for nb swaps
            gbytes = sizeof(double) * 4.*N*nb / 1e9;
                        
            TESTING_MALLOC_PIN( h_A1, double, lda*N );
            TESTING_MALLOC_PIN( h_A2, double, lda*N );
            TESTING_MALLOC_PIN( h_R1, double, lda*N );
            TESTING_MALLOC_PIN( h_R2, double, lda*N );
            
            TESTING_MALLOC_CPU( ipiv,  magma_int_t, nb );
            TESTING_MALLOC_CPU( ipiv2, magma_int_t, nb );
            
            TESTING_MALLOC_DEV( d_ipiv, magma_int_t, nb );
            TESTING_MALLOC_DEV( d_A1, double, ldda*N );
            TESTING_MALLOC_DEV( d_A2, double, ldda*N );
            
            // Random 1-based pivots in [1, N].  Dividing by RAND_MAX + 1 (not
            // RAND_MAX) keeps the scaled value strictly below N, so that
            // rand() == RAND_MAX cannot produce the out-of-range pivot N+1.
            for( j=0; j < nb; j++ ) {
                ipiv[j] = (magma_int_t) ((rand()*1.*N) / (RAND_MAX * 1. + 1.)) + 1;
            }
            
            /* =====================================================================
             * cublasDswap, row-by-row (2 matrices)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            // GPU: swap contiguous (stride 1) vector j of A1 with vector ipiv[j]-1 of A2
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf0 = gbytes / time;
            
            // CPU reference: same swaps on the host copies
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            // set this test's bit in 'check' if either matrix differs
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            // GPU: swap strided (stride ldda) vector j of A1 with vector ipiv[j]-1 of A2
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    cublasDswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf0 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswap, row-by-row (2 matrices)
             * Same access patterns as the cublasDswap tests above, using
             * magmablas_dswap instead.
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+ldda*j, 1, d_A2+ldda*(ipiv[j]-1), 1);
                }
            }
            time = magma_sync_wtime( queue ) - time;
            row_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    magmablas_dswap( N, d_A1+j, ldda, d_A2+ipiv[j]-1, ldda );
                }
            }
            time = magma_sync_wtime( queue ) - time;
            col_perf1 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dswapblk, blocked version (2 matrices)
             * One call handles all nb swaps; the CPU reference still applies
             * them one at a time.
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaRowMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            row_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A2+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;
            
            /* Column Major */
            init_matrix( N, N, h_A1, lda, 0 );
            init_matrix( N, N, h_A2, lda, 100 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            magma_dsetmatrix( N, N, h_A2, lda, d_A2, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dswapblk( MagmaColMajor, N, d_A1, ldda, d_A2, ldda, 1, nb, ipiv, 1, 0);
            time = magma_sync_wtime( queue ) - time;
            col_perf2 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+j, &lda, h_A2+(ipiv[j]-1), &lda);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            magma_dgetmatrix( N, N, d_A2, ldda, h_R2, lda );
            check += (diff_matrix( N, N, h_A1, lda, h_R1, lda ) ||
                      diff_matrix( N, N, h_A2, lda, h_R2, lda ))*shift;
            shift *= 2;

            /* =====================================================================
             * dpermute_long (1 matrix)
             * From here on the swaps are within a single matrix (A1 only).
             */
            
            /* Row Major */
            memcpy( ipiv2, ipiv, nb*sizeof(magma_int_t) );  // dpermute updates ipiv2
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dpermute_long2( N, d_A1, ldda, ipiv2, nb, 0 );
            time = magma_sync_wtime( queue ) - time;
            row_perf3 = gbytes / time;
            
            // CPU reference uses the untouched ipiv, not the copy handed to dpermute
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dlaswp( N, d_A1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf4 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswp (1 matrix) - d_ipiv on GPU
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            // Note: the timed region includes the upload of ipiv to d_ipiv.
            time = magma_sync_wtime( queue );
            magma_setvector( nb, sizeof(magma_int_t), ipiv, 1, d_ipiv, 1 );
            magmablas_dlaswp2( N, d_A1, ldda, 1, nb, d_ipiv, 1 );
            time = magma_sync_wtime( queue ) - time;
            row_perf7 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * LAPACK-style dlaswpx (extended for row- and col-major) (1 matrix)
             */
            
            /* Row Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, ldda, 1, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            row_perf5 = gbytes / time;
            
            for( j=0; j < nb; j++) {
                if ( j != (ipiv[j]-1)) {
                    blasf77_dswap( &N, h_A1+lda*j, &ione, h_A1+lda*(ipiv[j]-1), &ione);
                }
            }
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;
            
            /* Col Major */
            init_matrix( N, N, h_A1, lda, 0 );
            magma_dsetmatrix( N, N, h_A1, lda, d_A1, ldda );
            
            // same routine with the two stride arguments exchanged (1, ldda)
            time = magma_sync_wtime( queue );
            magmablas_dlaswpx( N, d_A1, 1, ldda, 1, nb, ipiv, 1);
            time = magma_sync_wtime( queue ) - time;
            col_perf5 = gbytes / time;
            
            // CPU reference is LAPACK dlaswp here; it is also the one CPU
            // call that is timed, giving the "CPU dlaswp" column.
            time = magma_wtime();
            lapackf77_dlaswp( &N, h_A1, &lda, &ione, &nb, ipiv, &ione);
            time = magma_wtime() - time;
            cpu_perf = gbytes / time;
            magma_dgetmatrix( N, N, d_A1, ldda, h_R1, lda );
            check += diff_matrix( N, N, h_A1, lda, h_R1, lda )*shift;
            shift *= 2;

            /* =====================================================================
             * Copy matrix.
             * Timed only, as a bandwidth baseline; the copied data is not verified.
             */
            
            // N x nb block
            time = magma_sync_wtime( queue );
            magma_dcopymatrix( N, nb, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            col_perf6 = 0.5 * gbytes / time;
            
            // nb x N block
            time = magma_sync_wtime( queue );
            magma_dcopymatrix( nb, N, d_A1, ldda, d_A2, ldda );
            time = magma_sync_wtime( queue ) - time;
            // copy reads 1 matrix and writes 1 matrix, so has half gbytes of swap
            row_perf6 = 0.5 * gbytes / time;

            // One table row.  The masks follow the order in which the tests
            // above doubled 'shift': 0x001 is the first test, 0x400 the last.
            printf("%5d  %3d  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c  %6.2f%c/ %6.2f%c  %6.2f / %6.2f  %6.2f  %10s\n",
                   (int) N, (int) nb,
                   row_perf0, ((check & 0x001) != 0 ? '*' : ' '),
                   col_perf0, ((check & 0x002) != 0 ? '*' : ' '),
                   row_perf1, ((check & 0x004) != 0 ? '*' : ' '),
                   col_perf1, ((check & 0x008) != 0 ? '*' : ' '),
                   row_perf2, ((check & 0x010) != 0 ? '*' : ' '),
                   col_perf2, ((check & 0x020) != 0 ? '*' : ' '),
                   row_perf3, ((check & 0x040) != 0 ? '*' : ' '),
                   row_perf4, ((check & 0x080) != 0 ? '*' : ' '),
                   row_perf7, ((check & 0x100) != 0 ? '*' : ' '),
                   row_perf5, ((check & 0x200) != 0 ? '*' : ' '),
                   col_perf5, ((check & 0x400) != 0 ? '*' : ' '),
                   row_perf6,
                   col_perf6,
                   cpu_perf,
                   (check == 0 ? "ok" : "* failed") );
            // count this row as failed if any routine's bit is set
            status += ! (check == 0);
            
            TESTING_FREE_PIN( h_A1 );
            TESTING_FREE_PIN( h_A2 );
            TESTING_FREE_PIN( h_R1 );
            TESTING_FREE_PIN( h_R2 );
            
            TESTING_FREE_CPU( ipiv  );
            TESTING_FREE_CPU( ipiv2 );
            
            TESTING_FREE_DEV( d_ipiv );
            TESTING_FREE_DEV( d_A1 );
            TESTING_FREE_DEV( d_A2 );
            fflush( stdout );
        }
        // blank line between sizes when each size is run several times
        if ( opts.niter > 1 ) {
            printf( "\n" );
        }
    }
    
    TESTING_FINALIZE();
    return status;
}
예제 #29
0
파일: mat.cpp 프로젝트: DamenStar/FRC-2012
/**
 * Constructs a state-space controller.
 *
 * Records the input/output/state counts, allocates every matrix used by the
 * controller through init_matrix(), and then pulls in the matrix values for
 * the selected controller by textually including that controller's header.
 *
 * @param inputs      stored in num_inputs
 * @param outputs     stored in num_outputs
 * @param states      stored in num_states
 * @param controller  selects which generated header supplies the matrix
 *                    values (SHOOTER or DRIVE); any other value leaves the
 *                    matrices as init_matrix() returned them
 *
 * NOTE(review): every dimension below is derived from num_states and
 * num_outputs; num_inputs is stored but never used to size a matrix here
 * (B, D, K, U, U_max, U_min all use num_outputs). Confirm this is intended.
 */
ss_controller::ss_controller(int inputs, int outputs, int states, controllers controller) :
  num_inputs(inputs),
  num_outputs(outputs),
  num_states(states)
{
  //initializes all the matrices
  // system and gain matrices
  A = init_matrix(num_states, num_states);
  B = init_matrix(num_states, num_outputs);
  C = init_matrix(num_outputs, num_states);
  D = init_matrix(num_outputs, num_outputs);
  L = init_matrix(num_states, num_outputs);
  K = init_matrix(num_outputs, num_states);
  // column vectors (second dimension is 1)
  X = init_matrix(num_states, 1);
  X_hat = init_matrix(num_states, 1);
  U = init_matrix(num_outputs, 1);
  U_max = init_matrix(num_outputs, 1);
  U_min = init_matrix(num_outputs, 1);
  // presumably scratch storage for intermediate products, judging by the
  // names -- their use is not visible in this constructor
  U_tmp = init_matrix(num_states, 1);
  b_u = init_matrix(num_states, 1);
  l_y = init_matrix(num_states, 1);
  l_c = init_matrix(num_states, num_states);
  a_lc = init_matrix(num_states, num_states);
  alc_xhat = init_matrix(num_states, 1);
  xhatp1 = init_matrix(num_states, 1);

  //import the matlab-computed matrix values
  // The headers are included *inside* the case bodies, so their contents are
  // compiled as statements of this constructor and run only for that case.
  switch (controller) {
    case SHOOTER:
      #include "shootercontroller.h"
      break;
    case DRIVE:
      #include "drivecontroller.h"
      break;
    default:
      break;
  }
}
예제 #30
0
파일: main.c 프로젝트: SteMaBa/gi4_uebung06
/*
 * Solves the linear system A * x = b with the Jacobi method.
 *
 * The system (A, b) of dimension MATRIX_SIZE is created by init_matrix().
 * Starting from a random vector, the iteration
 *
 *     x_new[i] = (b[i] - sum_{j != i} A[i][j] * x_old[j]) / A[i][i]
 *
 * is repeated until the Euclidean distance between two successive iterates
 * drops to epsilon or below.  Afterwards the solution is printed (small
 * systems only), every component is checked against the expected value 1.0,
 * and the iteration count and elapsed wall-clock time are reported.
 *
 * Returns 0 on completion, 1 if the work vectors could not be allocated.
 */
int main(int argc, char **argv)
{
	unsigned int i, j;
	unsigned int iterations = 0;
	double error, norm, max = 0.0;

	double sum;		/* off-diagonal sum of the current row */
	double dist_sq;		/* squared distance between two iterates */
	double *swap;		/* temporary for exchanging X and X_old */
	/* convergence threshold for the distance between two iterates */
	double epsilon = sqrt(0.00000001*MATRIX_SIZE);

	struct timeval start, end;

	printf("\nInitialize system of linear equations...\n");
	/* allocate memory for the system of linear equations */
	init_matrix(&A, &b, MATRIX_SIZE);
	X = (double *)malloc(sizeof(double) * MATRIX_SIZE);
	X_old = (double *)malloc(sizeof(double) * MATRIX_SIZE);
	if (X == NULL || X_old == NULL) {
		fprintf(stderr, "Could not allocate the solution vectors\n");
		free(X_old);
		free(X);
		clean_matrix(&A);
		clean_vector(&b);
		return 1;
	}

	/* a "random" solution vector */
	for (i = 0; i < MATRIX_SIZE; i++) {
		X[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;
		X_old[i] = 0.0;
	}

	printf("Start Jacobi method...\n");

	gettimeofday(&start, NULL);

	norm = 1.0;

	/* iterate until two successive iterates are very close together */
	while (norm > epsilon) {
		/*
		 * The current iterate becomes the "old" one and the other
		 * buffer receives the new values.  Exchanging the pointers
		 * up front guarantees that a whole sweep reads only values
		 * of the previous iterate, as the Jacobi method requires.
		 */
		swap = X_old;
		X_old = X;
		X = swap;

		/* one sweep over all components */
		for (i = 0; i < MATRIX_SIZE; i++) {
			/* sum over the off-diagonal entries of row i */
			sum = 0.0;
			for (j = 0; j < MATRIX_SIZE; j++) {
				/*
				 * Skip the diagonal entry without advancing j
				 * by hand, so the last row never reads past
				 * the end of A[i] and X_old.
				 */
				if (j != i)
					sum += A[i][j] * X_old[j];
			}

			X[i] = (b[i] - sum) / A[i][i];
		}

		/* Euclidean distance between the new and the old iterate */
		dist_sq = 0.0;
		for (i = 0; i < MATRIX_SIZE; i++)
			dist_sq += (X_old[i] - X[i]) * (X_old[i] - X[i]);

		norm = sqrt(dist_sq);

		iterations++;
	}

	gettimeofday(&end, NULL);

	if (MATRIX_SIZE < 16) {
		printf("Print the solution...\n");
		/* print solution */
		for (i = 0; i < MATRIX_SIZE; i++) {
			for (j = 0; j < MATRIX_SIZE; j++)
				printf("%8.2f\t", A[i][j]);
			printf("*\t%8.2f\t=\t%8.2f\n", X[i], b[i]);
		}
	}

	printf("Check the result...\n");
	/* 
	 * check the result 
	 * X[i] have to be 1
	 */
	for (i = 0; i < MATRIX_SIZE; i++) {
		error = fabs(X[i] - 1.0f);

		if (max < error)
			max = error;
		if (error > 0.01f)
			printf("Result is on position %u wrong (%f != 1.0)\n",
			       i, X[i]);
	}
	printf("maximal error is %f\n", max);

	printf("\nmatrix size: %d x %d\n", MATRIX_SIZE, MATRIX_SIZE);
	printf("number of iterations: %u\n", iterations);
	printf("Time : %lf sec\n",
	       (double)(end.tv_sec - start.tv_sec) + (double)(end.tv_usec -
							      start.tv_usec) /
	       1000000.0);

	/* frees the allocated memory */
	free(X_old);
	free(X);
	clean_matrix(&A);
	clean_vector(&b);

	return 0;
}