Пример #1
0
/*!
  Routine to compute the dot product of the first n local entries of two vectors.

  This is the reference dot-product implementation.  It _CANNOT_ be modified for the
  purposes of this benchmark.

  The local partial sum is accumulated with an OpenMP reduction unless HPCG_NO_OPENMP
  is defined.  Unless HPCG_NO_MPI is defined, the partial sums of all processes in
  MPI_COMM_WORLD are then combined with MPI_Allreduce.

  @param[in] n the number of vector elements (on this processor)
  @param[in] x, y the input vectors
  @param[out] result a reference to a scalar value, on exit will contain the result.
  @param[inout] time_allreduce accumulator to which the time spent in the all-reduce
                section is added (0.0 is added when built with HPCG_NO_MPI)

  @return returns 0; the code below has no path that returns a non-zero value

  @see ComputeDotProduct
*/
int ComputeDotProduct_ref(const local_int_t n, const Vector & x, const Vector & y,
    double & result, double & time_allreduce) {
  assert(x.localLength>=n); // Test vector lengths
  assert(y.localLength>=n);

  double local_result = 0.0;
  double * xv = x.values;
  double * yv = y.values;
  // When both arguments share the same storage, only one array is read.
  if (yv==xv) {
#ifndef HPCG_NO_OPENMP
    #pragma omp parallel for reduction (+:local_result)
#endif
    for (local_int_t i=0; i<n; i++) local_result += xv[i]*xv[i];
  } else {
#ifndef HPCG_NO_OPENMP
    #pragma omp parallel for reduction (+:local_result)
#endif
    for (local_int_t i=0; i<n; i++) local_result += xv[i]*yv[i];
  }

#ifndef HPCG_NO_MPI
  // Use MPI's reduce function to collect all partial sums
  // The timed interval covers the all-reduce and the copy into result.
  double t0 = mytimer();
  double global_result = 0.0;
  MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM,
      MPI_COMM_WORLD);
  result = global_result;
  time_allreduce += mytimer() - t0;
#else
  // No communication without MPI: the local sum is the final result.
  time_allreduce += 0.0;
  result = local_result;
#endif

  return 0;
}
Пример #2
0
/**
 * Computes the dot product of two arrays of n doubles.
 *
 * When USING_MPI is defined, the local partial sum is combined across
 * MPI_COMM_WORLD with MPI_Allreduce; otherwise the local sum is the result.
 *
 * @param[in]     n              number of local elements read from x and y
 * @param[in]     x, y           input arrays; may be the same pointer
 * @param[out]    result         receives the dot product
 * @param[in,out] time_allreduce accumulator increased by the time spent in the
 *                               all-reduce section (untouched without USING_MPI)
 * @return always 0
 */
int ddot (const int n, const double * const x, const double * const y,
	  double * const result, double & time_allreduce)
{
  double local_result = 0.0;
  // Same storage for both operands: read only one array.
  if (y==x)
    for (int i=0; i<n; i++) local_result += x[i]*x[i];
  else
    for (int i=0; i<n; i++) local_result += x[i]*y[i];

  // a little compute modeling
  // NOTE(review): presumably reports the loop above to the SST/macro simulator — confirm.
  SSTMAC_compute_loop(0, n, 1);

#ifdef USING_MPI
  // Use MPI's reduce function to collect all partial sums
  const double t0 = mytimer();
  double global_result = 0.0;
  MPI_Allreduce(&local_result, &global_result, 1, MPI_DOUBLE, MPI_SUM,
                MPI_COMM_WORLD);
  *result = global_result;
  time_allreduce += mytimer() - t0;
#else
  *result = local_result;
#endif

  return 0;
}
void
perform_element_loop(const simple_mesh_description<GlobalOrdinal>& mesh,
                     const Box& local_elem_box,
                     MatrixType& A, VectorType& b,
                     Parameters& params)
{
  typedef typename MatrixType::ScalarType Scalar;

  if (A.rows.size() == 0) return;

  int num_threads = params.numthreads;

  timer_type t0 = mytimer();

  //We will iterate the local-element-box (local portion of the mesh), and
  //assemble the FE operators into the global sparse linear-system.
  
  int global_elems_x = mesh.global_box[0][1];
  int global_elems_y = mesh.global_box[1][1];
  int global_elems_z = mesh.global_box[2][1];

  GlobalOrdinal num_elems = get_num_ids<GlobalOrdinal>(local_elem_box);
  std::vector<GlobalOrdinal> elemIDs(num_elems);

  BoxIterator iter = BoxIterator::begin(local_elem_box);
  BoxIterator end  = BoxIterator::end(local_elem_box);

  for(size_t i=0; iter != end; ++iter, ++i) {
    elemIDs[i] = get_id<GlobalOrdinal>(global_elems_x, global_elems_y, global_elems_z,
                                       iter.x, iter.y, iter.z);
  }

  LockingMatrix<MatrixType> lockingA(A);
  LockingVector<VectorType> lockingb(b);

  FEAssembleSumInto<GlobalOrdinal,Scalar,MatrixType,VectorType> fe_op;
  fe_op.mesh = &mesh;
  fe_op.elemIDs = &elemIDs[0];
  fe_op.A = &lockingA;
  fe_op.b = &lockingb;
  
  typedef typename VectorType::ComputeNodeType ComputeNodeType;

  ComputeNodeType& compute_node = b.compute_node;

  compute_node.parallel_for(elemIDs.size(), fe_op);

  std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}"<<std::endl;
  std::cout << "{number of vector conflicts: " << miniFE_num_vector_conflicts << "}"<<std::endl;
}
Пример #4
0
/**
 * Test driver for Epetra vectors.
 *
 * Builds Epetra_BlockMap and Epetra_Map objects through each of their
 * constructors (uniform, user-defined linear, user-defined arbitrary,
 * variable element size, copy) and runs VectorTests and MatrixTests on each.
 * When "-v" is given as first argument, additional label, assignment, MFLOPs
 * and stream-output checks are run and results are printed.
 *
 * @param argc  argument count
 * @param argv  argument vector; argv[1] starting with "-v" enables verbose mode
 * @return accumulated error code (0 when every EPETRA_TEST_ERR check passed)
 */
int main(int argc, char *argv[]) {

  int ierr = 0, i;

#ifdef EPETRA_MPI

  // Initialize MPI

  MPI_Init(&argc,&argv);
  int rank; // My process ID

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  Epetra_MpiComm Comm(MPI_COMM_WORLD);

#else

  int rank = 0;
  Epetra_SerialComm Comm;

#endif

#ifdef HAVE_EPETRA_TEUCHOS
  Teuchos::RCP<Teuchos::FancyOStream>
    fancyOut = Teuchos::VerboseObjectBase::getDefaultOStream();
  if (Comm.NumProc() > 1 ) {
    fancyOut->setShowProcRank(true);
    fancyOut->setOutputToRootOnly(-1);
  }
  std::ostream &out = *fancyOut;
#else
  std::ostream &out = std::cout;
#endif

  Comm.SetTracebackMode(0); // This should shut down any error tracing
  bool verbose = false;

  // Check if we should print results to standard out
  if (argc>1) if (argv[1][0]=='-' && argv[1][1]=='v') verbose = true;

  //  char tmp;
  //  if (rank==0) out << "Press any key to continue..."<< endl;
  //  if (rank==0) cin >> tmp;
  //  Comm.Barrier();

  int MyPID = Comm.MyPID();
  int NumProc = Comm.NumProc();

  if (verbose && MyPID==0)
    out << Epetra_Version() << endl << endl;

  if (verbose) out << Comm <<endl;

  // Remember the per-process verbose flag: the final section runs on every process.
  bool verbose1 = verbose;

  // Redefine verbose to only print on PE 0
  if (verbose && rank!=0) verbose = false;

  int NumMyElements = 10000;
  int NumMyElements1 = NumMyElements; // Needed for localmap
  int NumGlobalElements = NumMyElements*NumProc+EPETRA_MIN(NumProc,3);
  if (MyPID < 3) NumMyElements++;
  int IndexBase = 0;
  int ElementSize = 7;

  // Test LocalMap constructor
  // and Petra-defined uniform linear distribution constructor

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_LocalMap(NumMyElements1, IndexBase, Comm)" << endl;
  if (verbose) out << "     and Epetra_BlockMap(NumGlobalElements, ElementSize, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  // LocalMap is shared by every MatrixTests call below and released at the end.
  Epetra_LocalMap *LocalMap = new Epetra_LocalMap(NumMyElements1, IndexBase,
                              Comm);
  Epetra_BlockMap * BlockMap = new Epetra_BlockMap(NumGlobalElements, ElementSize, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  delete BlockMap;

  // Test User-defined linear distribution constructor

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(NumGlobalElements, NumMyElements, ElementSize, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  BlockMap = new Epetra_BlockMap(NumGlobalElements, NumMyElements, ElementSize, IndexBase, Comm);

  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  delete BlockMap;

  // Test User-defined arbitrary distribution constructor
  // Generate Global Element List.  Do in reverse for fun!

  int * MyGlobalElements = new int[NumMyElements];
  int MaxMyGID = (Comm.MyPID()+1)*NumMyElements-1+IndexBase;
  if (Comm.MyPID()>2) MaxMyGID+=3;
  for (i = 0; i<NumMyElements; i++) MyGlobalElements[i] = MaxMyGID-i;

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements,  ElementSize, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  BlockMap = new Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements, ElementSize,
		      IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  delete BlockMap;

  int * ElementSizeList = new int[NumMyElements];
  int NumMyEquations = 0;
  int NumGlobalEquations = 0;
  for (i = 0; i<NumMyElements; i++)
    {
      ElementSizeList[i] = i%6+2; // blocksizes go from 2 to 7
      NumMyEquations += ElementSizeList[i];
    }
  ElementSize = 7; // Set to maximum for use in checkmap
  NumGlobalEquations = Comm.NumProc()*NumMyEquations;

  // Adjust NumGlobalEquations based on processor ID
  if (Comm.NumProc() > 3)
    {
      if (Comm.MyPID()>2)
	NumGlobalEquations += 3*((NumMyElements)%6+2);
      else
	NumGlobalEquations -= (Comm.NumProc()-3)*((NumMyElements-1)%6+2);
    }

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements,  ElementSizeList, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  BlockMap = new Epetra_BlockMap(NumGlobalElements, NumMyElements, MyGlobalElements, ElementSizeList,
		      IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*BlockMap, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*BlockMap, *LocalMap, verbose),ierr);

  // Test Copy constructor

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_BlockMap(*BlockMap)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Epetra_BlockMap * BlockMap1 = new Epetra_BlockMap(*BlockMap);

  // Exercise the copy itself; the source map was already tested just above.
  EPETRA_TEST_ERR(VectorTests(*BlockMap1, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*BlockMap1, *LocalMap, verbose),ierr);

  delete [] ElementSizeList;
  delete [] MyGlobalElements;
  delete BlockMap;
  delete BlockMap1;


  // Test Petra-defined uniform linear distribution constructor

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(NumGlobalElements, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Epetra_Map * Map = new Epetra_Map(NumGlobalElements, IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*Map, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*Map, *LocalMap, verbose),ierr);

  delete Map;

  // Test User-defined linear distribution constructor

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(NumGlobalElements, NumMyElements, IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Map = new Epetra_Map(NumGlobalElements, NumMyElements, IndexBase, Comm);

  EPETRA_TEST_ERR(VectorTests(*Map, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*Map, *LocalMap, verbose),ierr);

  delete Map;

  // Test User-defined arbitrary distribution constructor
  // Generate Global Element List.  Do in reverse for fun!

  MyGlobalElements = new int[NumMyElements];
  MaxMyGID = (Comm.MyPID()+1)*NumMyElements-1+IndexBase;
  if (Comm.MyPID()>2) MaxMyGID+=3;
  for (i = 0; i<NumMyElements; i++) MyGlobalElements[i] = MaxMyGID-i;

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(NumGlobalElements, NumMyElements, MyGlobalElements,  IndexBase, Comm)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Map = new Epetra_Map(NumGlobalElements, NumMyElements, MyGlobalElements,
		      IndexBase, Comm);
  EPETRA_TEST_ERR(VectorTests(*Map, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(*Map, *LocalMap, verbose),ierr);

  // Test Copy constructor

  if (verbose) out << "\n*********************************************************" << endl;
  if (verbose) out << "Checking Epetra_Map(*Map)" << endl;
  if (verbose) out << "*********************************************************" << endl;

  Epetra_Map Map1(*Map);

  // Exercise the copy itself; the source map was already tested just above.
  EPETRA_TEST_ERR(VectorTests(Map1, verbose),ierr);

  EPETRA_TEST_ERR(MatrixTests(Map1, *LocalMap, verbose),ierr);

  delete [] MyGlobalElements;
  delete Map;
  delete LocalMap; // no further MatrixTests call uses it

  if (verbose1)
    {
      // Test Vector MFLOPS for 2D Dot Product
      int M = 1;
      int K = 1000000;
      Epetra_Map Map2(-1, K, IndexBase, Comm);
      Epetra_LocalMap Map3(M, IndexBase, Comm);

      Epetra_Vector A(Map2);A.Random();
      Epetra_Vector B(Map2);B.Random();
      Epetra_Vector C(Map3);C.Random();

      // Test Epetra_Vector label
      const char* VecLabel = A.Label();
      const char* VecLabel1 = "Epetra::Vector";
      if (verbose) out << endl << endl <<"This should say " << VecLabel1 << ": " << VecLabel << endl << endl << endl;
      EPETRA_TEST_ERR(strcmp(VecLabel1,VecLabel),ierr);
      if (verbose) out << "Testing Assignment operator" << endl;

      double tmp1 = 1.00001* (double) (MyPID+1);
      double tmp2 = tmp1;
      A[1] = tmp1;
      tmp2 = A[1];
      out << "On PE "<< MyPID << "  A[1] should equal = " << tmp1;
      if (tmp1==tmp2) out << " and it does!" << endl;
      else out << " but it equals " << tmp2 << endl; // terminate the line in the mismatch case too

      Comm.Barrier();

      if (verbose) out << endl << endl << "Testing MFLOPs" << endl;
      Epetra_Flops counter;
      C.SetFlopCounter(counter);
      Epetra_Time mytimer(Comm);
      C.Multiply('T', 'N', 0.5, A, B, 0.0);
      double Multiply_time = mytimer.ElapsedTime();
      double Multiply_flops = C.Flops();
      if (verbose) out << "\n\nTotal FLOPs = " << Multiply_flops << endl;
      if (verbose) out << "Total Time  = " << Multiply_time << endl;
      if (verbose) out << "MFLOPs      = " << Multiply_flops/Multiply_time/1000000.0 << endl;

      Comm.Barrier();

      // Test Vector ostream operator with Petra-defined uniform linear distribution constructor
      // and a small vector

      Epetra_Map Map4(100, IndexBase, Comm);
      double * Dp = new double[100];
      for (i=0; i<100; i++)
	Dp[i] = i;
      // D is a view of Dp, so Dp is only released after D's last use below.
      Epetra_Vector D(View, Map4,Dp);

      if (verbose) out << "\n\nTesting ostream operator:  Multivector  should be 100-by-2 and print i,j indices"
	   << endl << endl;
      out << D << endl;

      if (verbose) out << "Traceback Mode value = " << D.GetTracebackMode() << endl;
      delete [] Dp;
    }

#ifdef EPETRA_MPI
  MPI_Finalize();
#endif

  return ierr;

}
Пример #5
0
/*!
  Routine to compute an approximate solution to Ax = b

  @param[in]    A    The known system matrix
  @param[inout] data The data structure with all necessary CG vectors preallocated
  @param[in]    b    The known right hand side vector
  @param[inout] x    On entry: the initial guess; on exit: the new approximate solution
  @param[in]    max_iter  The maximum number of iterations to perform, even if tolerance is not met.
  @param[in]    tolerance The stopping criterion to assert convergence: iteration stops once
                          normr/normr0 is <= tolerance.
  @param[out]   niters    The number of iterations actually performed (0 if none was needed).
  @param[out]   normr     The 2-norm of the residual vector after the last iteration.
  @param[out]   normr0    The 2-norm of the residual vector before the first iteration.
  @param[inout] times     Timing accumulators; entries 0..5 are incremented with, in order: total,
                          dot-product, WAXPBY, SPMV, all-reduce and preconditioner time.
  @param[in]    doPreconditioning The flag to indicate whether the preconditioner should be invoked at each iteration.

  @return Returns zero on success and a non-zero value otherwise.

  @see CG_ref()
*/
int CG(const SparseMatrix & A, CGData & data, const Vector & b, Vector & x,
    const int max_iter, const double tolerance, int & niters, double & normr, double & normr0,
    double * times, bool doPreconditioning) {

  double t_begin = mytimer();  // Start timing right away
  normr = 0.0;
  niters = 0; // Defined result even when the iteration loop below never executes
  double rtz = 0.0, oldrtz = 0.0, alpha = 0.0, beta = 0.0, pAp = 0.0;


  // NOTE(review): t0 is presumably the scratch variable used by the TICK()/TOCK() macros — confirm.
  double t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0, t4 = 0.0, t5 = 0.0;
//#ifndef HPCG_NOMPI
//  double t6 = 0.0;
//#endif
  local_int_t nrow = A.localNumberOfRows;
  Vector & r = data.r; // Residual vector
  Vector & z = data.z; // Preconditioned residual vector
  Vector & p = data.p; // Direction vector (in MPI mode ncol>=nrow)
  Vector & Ap = data.Ap;

  if (!doPreconditioning && A.geom->rank==0) HPCG_fout << "WARNING: PERFORMING UNPRECONDITIONED ITERATIONS" << std::endl;

#ifdef HPCG_DEBUG
  const int print_freq = 1; // print every iteration in debug builds
#endif
  // p is of length ncols, copy x to p for sparse MV operation
  CopyVector(x, p);     //TODO parallel
  TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); // Ap = A*p
  TICK(); ComputeWAXPBY(nrow, 1.0, b, -1.0, Ap, r, A.isWaxpbyOptimized);  TOCK(t2); // r = b - Ax (x stored in p)
  TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); TOCK(t1);
  normr = sqrt(normr);
#ifdef HPCG_DEBUG
  if (A.geom->rank==0) HPCG_fout << "Initial Residual = "<< normr << std::endl;
#endif

  // Record initial residual for convergence testing
  normr0 = normr;

  // Start iterations

  for (int k=1; k<=max_iter && normr/normr0 > tolerance; k++ ) {
    TICK();
    if (doPreconditioning)
      ComputeMG(A, r, z); // Apply preconditioner
    else
      CopyVector (r, z); // copy r to z (no preconditioning)
    TOCK(t5); // Preconditioner apply time

    if (k == 1) {
      TICK(); ComputeWAXPBY(nrow, 1.0, z, 0.0, z, p, A.isWaxpbyOptimized); TOCK(t2); // Copy Mr to p
      TICK(); ComputeDotProduct (nrow, r, z, rtz, t4, A.isDotProductOptimized); TOCK(t1); // rtz = r'*z
    } else {
      oldrtz = rtz;
      TICK(); ComputeDotProduct (nrow, r, z, rtz, t4, A.isDotProductOptimized); TOCK(t1); // rtz = r'*z
      beta = rtz/oldrtz;
      TICK(); ComputeWAXPBY (nrow, 1.0, z, beta, p, p, A.isWaxpbyOptimized);  TOCK(t2); // p = beta*p + z
    }

    TICK(); ComputeSPMV(A, p, Ap); TOCK(t3); // Ap = A*p
    TICK(); ComputeDotProduct(nrow, p, Ap, pAp, t4, A.isDotProductOptimized); TOCK(t1); // pAp = p'*Ap
    alpha = rtz/pAp;
    // Both vector updates below are timed together as WAXPBY time.
    TICK(); ComputeWAXPBY(nrow, 1.0, x, alpha, p, x, A.isWaxpbyOptimized);// x = x + alpha*p
    ComputeWAXPBY(nrow, 1.0, r, -alpha, Ap, r, A.isWaxpbyOptimized);  TOCK(t2);// r = r - alpha*Ap
    TICK(); ComputeDotProduct(nrow, r, r, normr, t4, A.isDotProductOptimized); TOCK(t1);
    normr = sqrt(normr);
#ifdef HPCG_DEBUG
    if (A.geom->rank==0 && (k%print_freq == 0 || k == max_iter))
      HPCG_fout << "Iteration = "<< k << "   Scaled Residual = "<< normr/normr0 << std::endl;
#endif
    niters = k;
  }

  // Store times
  times[1] += t1; // dot-product time
  times[2] += t2; // WAXPBY time
  times[3] += t3; // SPMV time
  times[4] += t4; // AllReduce time
  times[5] += t5; // preconditioner apply time
//#ifndef HPCG_NOMPI
//  times[6] += t6; // exchange halo time
//#endif
  times[0] += mytimer() - t_begin;  // Total time. All done...
  return(0);
}
Пример #6
0
/// Entry point.  Fills an in-memory table row by row with fillRow and writes
/// it out in chunks of at most nrpd rows.  When more than one chunk is needed,
/// each chunk goes to its own sub-directory of the output directory given as
/// the first argument.
int main(int argc, char** argv) {
    uint32_t maxrow = 0;
    uint32_t nrpd = 0;
    int nparts;
    int ndigits;
    int ierr;

    // the output directory name is mandatory
    if (argc < 2) {
        std::cerr << "\nUsage:\n" << *argv
                  << " <output-dir> [#rows [#rows-per-dir [conf-file]]]\n"
                  << "If the 4th argument is not provided, the number of "
                     "rows per directory will be determined by the memory cache size, "
                     "which is by default 1/2 of the physical memory size.\n"
                  << std::endl;
        return -1;
    }

    //ibis::gVerbose = 8;
    // the optional fifth argument is handed to ibis::init
    const char* confFile = (argc > 4) ? argv[4] : (const char*)0;
    ibis::init(confFile);
    ibis::util::timer mytimer(*argv, 0);

    // total number of rows: user value, else derived from the cache size
    if (argc > 2) {
        maxrow = (uint32_t)atof(argv[2]);
    }
    if (maxrow == 0) {
        double cacheSize = ibis::fileManager::currentCacheSize();
        maxrow = (uint32_t)
            ibis::util::compactValue(cacheSize / 120.0, cacheSize / 80.0);
        nrpd = maxrow;
    }
    if (maxrow < 10) {
        maxrow = 10;
    }

    // rows per directory: user value, else derived from the cache size
    if (argc > 3) {
        nrpd = (uint32_t) atof(argv[3]);
    }
    if (nrpd == 0) {
        double cacheSize = ibis::fileManager::currentCacheSize();
        nrpd = (uint32_t)
            ibis::util::compactValue(cacheSize / 120.0, cacheSize / 80.0);
    }
    if (nrpd > maxrow) {
        nrpd = maxrow;
    }

    ibis::table::row val;
    std::auto_ptr<ibis::tablex> tab(ibis::tablex::create());
    initColumns(*tab, val);

    // a positive return value smaller than the request caps nrpd
    ierr = tab->reserveBuffer(nrpd);
    if (ierr > 0 && (unsigned)ierr < nrpd) {
        nrpd = ierr;
    }
    LOGGER(1) << *argv << ' ' << argv[1] << ' ' << maxrow << ' ' << nrpd
              << std::endl;

    // number of partitions, rounded up
    nparts = maxrow / nrpd;
    nparts += (maxrow > nparts*nrpd);

    // number of hexadecimal digits needed to print nparts
    ndigits = 1;
    ierr = nparts >> 4;
    while (ierr > 0) {
        ierr >>= 4;
        ++ndigits;
    }

    uint32_t irow = 1;
    while (irow <= maxrow) {
        const uint32_t last = irow - 1 + nrpd;
        TDList tdl;
        std::string dir = argv[1];

        if (nparts > 1) { // append a per-partition sub-directory name
            // locate the last path component of the output directory
            const char* tail = strrchr(argv[1], FASTBIT_DIRSEP);
            if (tail != 0) {
                if (tail[1] != 0) {
                    ++ tail;
                }
                else {
                    // trailing separator: walk back towards the previous one
                    while (tail-1 > argv[1] && *(tail-1) != FASTBIT_DIRSEP) {
                        -- tail;
                    }
                }
            }

            std::ostringstream suffix;
            suffix << FASTBIT_DIRSEP << (tail ? tail : "_");
            suffix << std::hex << std::setprecision(ndigits)
                   << std::setw(ndigits) << std::setfill('0')
                   << irow / nrpd;
            dir += suffix.str();
        }

        // fill the in-memory table with the rows of this partition
        while (irow <= last) {
            fillRow(irow, val, tdl);
            ierr = tab->appendRow(val);
            LOGGER(ierr != 6)
                << "Warning -- " << *argv << " failed to append row " << irow
                << " to the in-memory table, appendRow returned " << ierr;
            LOGGER(irow % 100000 == 0) << " . " << irow;
            ++ irow;
        }
        LOGGER(1) << "\n";

        // flush the partition to disk and reset the buffers for the next one
        ierr = tab->write(dir.c_str());
        LOGGER(ierr < 0)
            << "Warning -- " << *argv << " failed to write " << tab->mRows()
            << " rows to " << dir << ", ibis::tablex::write returned " << ierr;
        writeTDList(tdl, dir.c_str());
        tab->clearData();
        tdl.clear();
    }
    return 0;
} // main
Пример #7
0
/*
 * Timer-expiry handler: calls mytimer() and then clears keep_going.
 * NOTE(review): keep_going is presumably a flag polled by a loop elsewhere,
 * and mytimer() presumably records the elapsed time — confirm against the
 * definitions, which are outside this block.
 */
void timer_expired()
{
    mytimer();
    keep_going = 0;
}
Пример #8
0
/**
 * Solves A x = b with the NVAMG library and reports residuals and timings.
 *
 * Steps: select a GPU from environment variables, create the NVAMG objects,
 * describe the halo exchange to NVAMG, compute and print the initial
 * residual, upload the system, run setup and solve, download the solution,
 * then compute and print the final residual.
 *
 * Environment variables read:
 *  - CUDA_NGPU: number of GPUs used for the modulo mapping (default 2)
 *  - CUDA_SKIP_GPU: device index to skip (default 99999)
 *  - SLURM_LOCALID, MV2_COMM_WORLD_LOCAL_RANK, OMPI_COMM_WORLD_LOCAL_RANK:
 *    local rank; when several are set, the last one in this order wins
 *
 * @param A            operator; must have local indices (has_local_indices == true)
 * @param b            right-hand side
 * @param x            on entry overwritten with all ones; on exit the solution downloaded from NVAMG
 * @param matvec       functor computing y = A*x, used for the residual checks
 * @param max_iter     not used by this implementation
 * @param tolerance    not used by this implementation
 * @param num_iters    on exit, the iteration count reported by the NVAMG solver
 * @param normr        on exit, the 2-norm of the final residual b - A*x
 * @param my_cg_times  timing array; entries WAXPY, DOT, MATVEC, MATVECDOT and TOTAL are written
 *
 * NOTE(review): none of the NVAMG handles nor the duplicated communicator are
 * released here; the matching destroy calls are not visible in this file.
 */
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  // NOTE(review): t0 is presumably the scratch variable of the TICK()/TOCK() macros — confirm.
  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    return;
  }

  // ---- Device selection from the environment ----
  int ngpu = 2;
  int device = 0;
  int skip_gpu = 99999;
  const char* env_value = getenv("CUDA_NGPU");
  if (env_value != NULL) {
    ngpu = atoi(env_value);
  }
  if (ngpu < 1) ngpu = 1; // guards the modulo below against CUDA_NGPU=0 or garbage
  env_value = getenv("CUDA_SKIP_GPU");
  if (env_value != NULL) {
    skip_gpu = atoi(env_value);
  }
  // Each launcher exports the node-local rank under a different name.
  static const char* const local_rank_vars[] = {
    "SLURM_LOCALID", "MV2_COMM_WORLD_LOCAL_RANK", "OMPI_COMM_WORLD_LOCAL_RANK"
  };
  const size_t num_rank_vars = sizeof(local_rank_vars)/sizeof(local_rank_vars[0]);
  for (size_t v = 0; v < num_rank_vars; ++v) {
    env_value = getenv(local_rank_vars[v]);
    if (env_value != NULL) {
      const int local_rank = atoi(env_value);
      device = local_rank % ngpu;
      if (device >= skip_gpu) device++;
    }
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  // ---- NVAMG object creation ----
  NVAMG_SAFE_CALL(NVAMG_initialize());
  NVAMG_SAFE_CALL(NVAMG_initialize_plugins());
  NVAMG_matrix_handle matrix;
  NVAMG_vector_handle rhs;
  NVAMG_vector_handle soln;
  NVAMG_resources_handle rsrc = NULL;
  NVAMG_solver_handle solver = NULL;
  NVAMG_config_handle config;
  NVAMG_SAFE_CALL(NVAMG_config_create_from_file(&config,"NVAMG_CONFIG" ));

  MPI_Comm nvamg_comm;
  MPI_Comm_dup(MPI_COMM_WORLD, &nvamg_comm);
  int devices[] = {device};

  // NOTE(review): unlike the other NVAMG calls, this return value is not checked — confirm whether it should be.
  NVAMG_resources_create(&rsrc, config, &nvamg_comm, 1, devices);
  NVAMG_SAFE_CALL(NVAMG_solver_create(&solver, rsrc, NVAMG_mode_dDDI, config));
  NVAMG_SAFE_CALL(NVAMG_matrix_create(&matrix, rsrc, NVAMG_mode_dDDI));
  NVAMG_SAFE_CALL(NVAMG_vector_create(&rhs, rsrc, NVAMG_mode_dDDI));
  NVAMG_SAFE_CALL(NVAMG_vector_create(&soln, rsrc, NVAMG_mode_dDDI));

  //Generating communication Maps for NVAMG
  const size_t num_neighbors = A.neighbors.size();
  if (num_neighbors > 0) {
    int** send_map = new int*[num_neighbors];
    int** recv_map = new int*[num_neighbors];
    int send_offset = 0;
    // Received entries are numbered after the locally owned rows.
    int recv_offset = A.row_offsets.size()-1;
    for (size_t i = 0; i < num_neighbors; ++i) {
      // Send lists alias A.elements_to_send; receive lists are built here.
      send_map[i] = &A.elements_to_send[send_offset];
      send_offset += A.send_length[i];
      recv_map[i] = new int[A.recv_length[i]];
      for (int j = 0; j < A.recv_length[i]; ++j)
        recv_map[i][j] = recv_offset+j;
      recv_offset += A.recv_length[i];
    }
    const int** send_map_c = (const int**) send_map;
    const int** recv_map_c = (const int**) recv_map;
    NVAMG_SAFE_CALL(NVAMG_matrix_comm_from_maps_one_ring(
      matrix, 1, A.neighbors.size(),A.neighbors.data(),
      A.send_length.data(), send_map_c,
      A.recv_length.data(), recv_map_c));
    NVAMG_SAFE_CALL(NVAMG_vector_bind(rhs,matrix));
    NVAMG_SAFE_CALL(NVAMG_vector_bind(soln,matrix));
    // Release the per-neighbor receive lists and both pointer tables.
    for (size_t i = 0; i < num_neighbors; ++i)
      delete [] recv_map[i];
    delete [] recv_map;
    delete [] send_map;
  }

  // The incoming initial guess is replaced by a vector of ones.
  for (size_t i = 0; i < x.coefs.size(); ++i) x.coefs[i]=1;

  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);

  normr = 0;
  magnitude_type rtrans = 0;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  // ---- Initial residual: r = b - A*x ----
  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot_r2(r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }

  // ---- Upload, solve, download ----
  //Matrix upload needs to happen before vector, otherwise it crashes
  NVAMG_SAFE_CALL(NVAMG_matrix_upload_all(matrix,A.row_offsets.size()-1, A.packed_coefs.size(),1,1, &A.row_offsets[0],&A.packed_cols[0],&A.packed_coefs[0], NULL));
  NVAMG_SAFE_CALL(NVAMG_vector_upload(soln, p.coefs.size(), 1, &p.coefs[0]));
  NVAMG_SAFE_CALL(NVAMG_vector_upload(rhs, b.coefs.size(), 1, &b.coefs[0]));

  NVAMG_SAFE_CALL(NVAMG_solver_setup(solver, matrix));
  NVAMG_SAFE_CALL(NVAMG_solver_solve(solver, rhs, soln));
  NVAMG_SAFE_CALL(NVAMG_vector_download(soln, &x.coefs[0]));

  int niter = 0;
  NVAMG_SAFE_CALL(NVAMG_solver_get_iterations_number(solver, &niter));
  num_iters = niter; // hand the iteration count back to the caller

  // ---- Final residual: r = b - A*x ----
  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);
  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot_r2(r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Final Residual = "<< normr << " after " << niter << " iterations" << std::endl;
  }

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
Пример #9
0
/*
 * MPI game-of-life driver.
 *
 * Usage: prog [nb_iterations [board_size]]   (nb_iterations defaults to 10)
 *
 * Rank 0 generates the global board, which is scattered in square blocks over
 * the process grid.  Every iteration exchanges ghost rows/columns with the four
 * neighbours, counts neighbours and updates the local block.  At the end the
 * blocks are gathered back, the living cells are summed on rank 0 and the
 * elapsed time is printed.
 *
 * NOTE(review): the cell(), ngb() and global_cell() macros are defined outside
 * this block and presumably expand to accesses on the locals board, nbngb and
 * global_board with the ld* leading dimensions — keep those names.
 */
int main(int argc, char* argv[])
{
  int i, j, loop, num_alive, maxloop;
  int ldboard, ldnbngb, ldlboard;
  double t1 = 0.0, t2;
  double temps;

  int *board;
  int *nbngb;

  int local_alive;
  /* Only allocated on rank 0; NULL elsewhere instead of an indeterminate value. */
  int *global_board = NULL;

  struct grid grid;
  MPI_Comm comm;
  MPI_Datatype blocktype; /* strided type used for the upper/lower ghost exchange */
  int nb_proc_row;
  int nb_proc_tot;
  int rank;
  int nb_in_block;

  MPI_Init(&argc,&argv);
  MPI_Comm_size(MPI_COMM_WORLD, &nb_proc_tot);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // initialization of the grid communicator
  // NOTE(review): a failure here exits with EXIT_SUCCESS; presumably this is how
  // processes left out of the grid bow out quietly — confirm.
  if (EXIT_FAILURE == compute_communicator(nb_proc_tot,&nb_proc_row,&comm,&rank)){
    MPI_Finalize();
    return EXIT_SUCCESS;
  }


  if (argc < 2) {
    maxloop = 10;
  } else if (argc > 2){
    maxloop = atoi(argv[1]);
    BS = atoi(argv[2]);
  } else
    maxloop = atoi(argv[1]);
  num_alive = 0;
  local_alive = 0;

  /* Leading dimension of the board array */
  ldboard = BS;
  if (ldboard % nb_proc_row != 0){
    if (rank == 0)
      printf("Wrong BS (or wrong number of procs) ... exiting now.\n");
    MPI_Finalize();
    return EXIT_FAILURE;
  }

  /* Leading dimension of the neighbour counters array */
  nb_in_block = ldboard / nb_proc_row;
  ldnbngb = nb_in_block;
  ldlboard = nb_in_block + 2; /* local block plus one ghost layer on each side */

  board = malloc( ldlboard * ldlboard * sizeof(int) );
  nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );

  if (rank == 0){
    global_board = malloc( ldboard * ldboard * sizeof(int) );
    num_alive = generate_initial_board( &global_cell( 1, 1), ldboard );
    printf("Starting number of living cells = %d\n", num_alive);
    t1 = mytimer();
  }

  matrix_placement_proc(nb_proc_row, nb_in_block, &comm, &(global_cell( 1, 1)), &(cell( 1, 1)), SCATTER, ldlboard);

  mpi_grid_init(&comm, &grid, rank);
  //printf("rank #%d: %d %d\n", rank, grid.rank_I, grid.rank_J);


  //output_lboard( nb_in_block, board, ldlboard, 0, rank );

  /* The exchange type depends only on nb_in_block and ldlboard, so it is built
     once here and freed after the loop instead of being re-created (and never
     freed) on every iteration. */
  MPI_Type_vector(nb_in_block, 1, ldlboard, MPI_INT, &blocktype);
  MPI_Type_commit(&blocktype);

  for (loop = 1; loop <= maxloop; loop++) {

    // for upper/lower ghost row
    MPI_Sendrecv(&(cell( 1, 1)), 1, blocktype, grid.proc_above, 99,
		 &(cell( nb_in_block+1, 1)), 1, blocktype, grid.proc_under, 99,
		 comm, MPI_STATUS_IGNORE);
    MPI_Sendrecv(&(cell( nb_in_block, 1)), 1, blocktype, grid.proc_under, 99,
		 &(cell( 0, 1)), 1, blocktype, grid.proc_above, 99,
		 comm, MPI_STATUS_IGNORE);

    // for left/right ghost col (sent with the ghost rows included)
    MPI_Sendrecv(&(cell( 0, 1)), ldlboard, MPI_INT, grid.proc_left, 98,
		 &(cell( 0, nb_in_block+1)), ldlboard, MPI_INT, grid.proc_right, 98,
		 comm, MPI_STATUS_IGNORE);
    MPI_Sendrecv(&(cell( 0, nb_in_block)), ldlboard, MPI_INT, grid.proc_right, 98,
		 &(cell( 0, 0)), ldlboard, MPI_INT, grid.proc_left, 98,
		 comm, MPI_STATUS_IGNORE);

    //debug
    /* if (loop == 1) */
    /*   output_lboard( nb_in_block, board, ldlboard, 0, rank ); */

    // count the neighbours of every local cell
    for (j = 1; j <= nb_in_block; j++) {
      for (i = 1; i <= nb_in_block; i++) {
  	ngb( i, j ) =
  	  cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
  	  cell( i-1, j   ) +                  cell( i+1, j   ) +
  	  cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
      }
    }

    // update the board: a cell dies with <2 or >3 neighbours, is born with exactly 3
    local_alive = 0;
    for (j = 1; j <= nb_in_block; j++) {
      for (i = 1; i <= nb_in_block; i++) {
  	if ( (ngb( i, j ) < 2) ||
  	     (ngb( i, j ) > 3) ) {
  	  cell(i, j) = 0;
  	}
  	else {
  	  if ((ngb( i, j )) == 3)
  	    cell(i, j) = 1;
  	}
  	if (cell(i, j) == 1) {
  	  local_alive ++;
  	}
      }
    }

    //output_lboard( nb_in_block, board, ldlboard, loop, rank );
#ifdef PRINT_ALIVE
    MPI_Reduce(&local_alive, &num_alive, 1, MPI_INT, MPI_SUM, 0, comm);
    if (rank == 0)
      printf("%d \n", num_alive);
#endif
  }

  MPI_Type_free(&blocktype);

  matrix_placement_proc(nb_proc_row, nb_in_block, &comm, &(cell( 1, 1)), &(global_cell( 1, 1)), GATHER, ldlboard);
  MPI_Reduce(&local_alive, &num_alive, 1, MPI_INT, MPI_SUM, 0, comm);

  if (rank == 0){
    t2 = mytimer();
    temps = t2 - t1;
    printf("Final number of living cells = %d\n", num_alive);
    printf("time=%.2lf ms\n",(double)temps * 1.e3);

    //output_board( BS, &(global_cell(1, 1)), ldboard, maxloop);
    free(global_board);
  }
  free(board);
  free(nbngb);

  MPI_Comm_free(&comm);
  MPI_Finalize();

  return EXIT_SUCCESS;
}
Пример #10
0
/*
 * Multi-threaded Game of Life driver.
 *
 * Usage: prog nb_iterations size [nb_threads]
 *
 * Stores the command-line values in the file-level settings maxloop, BS and
 * (when a third argument is given) nb_threads, allocates the shared board
 * and neighbour-counter arrays, starts nb_threads workers running
 * thread_compute, and sums the per-thread counts handed back through
 * pthread_join.  Every resource created here (worker handles, semaphores,
 * mutex, condition variable, arrays) is released before returning.
 *
 * Returns EXIT_SUCCESS, including when only the usage message is printed.
 */
int main(int argc, char* argv[])
{
    int num_alive = 0;
    int ldboard, ldnbngb;
    double t1, t2;
    double temps;

    if (argc < 3) {
        printf("Usage: %s nb_iterations size [nb_threads]\n", argv[0]);
        return EXIT_SUCCESS;
    } else {
        maxloop = atoi(argv[1]);
        BS = atoi(argv[2]);
    }
    if(argc > 3)
      nb_threads = atoi(argv[3]);
    num_alive = 0;

    /* Leading dimension of the board array (one ghost cell on each side) */
    ldboard = BS + 2;
    /* Leading dimension of the neighbour counters array */
    ldnbngb = BS;

    _board = malloc( ldboard * ldboard * sizeof(int) );
    _nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );
    /* Local alias named `board` because cell(1, 1) below is used with it */
    int *board = _board;
    num_alive = generate_initial_board( BS, &(cell(1, 1)), ldboard );

    /* One handle and one semaphore per worker, plus the shared barrier state */
    pthread_t *threads=malloc(nb_threads*sizeof(*threads));
    nbdone = malloc(nb_threads*sizeof(*nbdone));
    for(int i=0; i<nb_threads; i++)
      sem_init(nbdone+i, 0, 0);
    pthread_cond_init(&barrier_cond, NULL);
    pthread_mutex_init(&barrier_mut, NULL);

    printf("Starting number of living cells = %d\n", num_alive);
    t1 = mytimer();

    for(int i=0; i<nb_threads; i++){
      /* Each worker gets its own heap-allocated id as its argument */
      int *id = malloc(sizeof(*id));
      *id = i;
      pthread_create(threads+i, NULL, thread_compute, (void *)id);
    }

    /* Each worker's return value is read as a heap-allocated int and freed here */
    num_alive = 0;
    for(int i=0; i<nb_threads; i++){
      void *result_alive;
      pthread_join(threads[i], &result_alive);
      num_alive += *(int*)result_alive;
      free(result_alive);
    }

    t2 = mytimer();
    temps = t2 - t1;
    printf("Final number of living cells = %d\n", num_alive);
    printf("%.2lf\n",(double)temps * 1.e3);

    /* All workers have been joined, so the synchronisation objects and the
       bookkeeping arrays can be torn down safely. */
    for(int i=0; i<nb_threads; i++)
      sem_destroy(nbdone+i);
    pthread_cond_destroy(&barrier_cond);
    pthread_mutex_destroy(&barrier_mut);
    free(nbdone);
    free(threads);

    free(_board);
    free(_nbngb);
    return EXIT_SUCCESS;
}
Пример #11
0
/**
   Write the content of the dictionary to the named file.  The existing
   content in the named file is overwritten.  The content of the dictionary
   file is laid out as follows.

   \li Signature "#IBIS Dictionary " and version number (currently
   0x020000). (20 bytes)

   \li N = Number of strings in the file. (4 bytes)

   \li uint64_t[N+1]: the starting positions of the strings in this file.

   \li uint32_t[N]: The integer code corresponding to each string value.

   \li the string values packed one after the other with their nil
   terminators.
*/
int ibis::dictionary::write(const char* name) const {
    std::string evt = "dictionary::write";
    if (name == 0 || *name == 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " can not proceed with a "
            "null string as the file name";
        return -1;
    }
    if (ibis::gVerbose > 1) {
        evt += '(';
        evt += name;
        evt += ')';
    }
    if (key_.size() > raw_.size()) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt
            << " can not write an inconsistent dictionary, key_.size("
            << key_.size() << "), raw_.size(" << raw_.size() << ")";
        return -2;
    }

    ibis::util::timer mytimer(evt.c_str(), 4);
    FILE* fptr = fopen(name, "wb");
    if (fptr == 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to open the file ... "
            << (errno ? strerror(errno) : "no free stdio stream");
        return -3;
    }

    IBIS_BLOCK_GUARD(fclose, fptr);
    int ierr = fwrite(_fastbit_dictionary_header, 1, 20, fptr);
    if (ierr != 20) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt
            << " failed to write the header, fwrite returned " << ierr;
        return -4;
    }

    const uint32_t nkeys = key_.size();
    ierr = fwrite(&nkeys, sizeof(nkeys), 1, fptr);
    if (ierr != 1) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to write the size(" << nkeys
            << "), fwrite returned " << ierr;
        return -5;
    }
    if (nkeys == 0) // nothing else to write
        return 0;

    mergeBuffers();
    array_t<uint64_t> pos(nkeys+1);
    array_t<uint32_t> qos(nkeys);

    pos.clear();
    qos.clear();
    pos.push_back(0);
    if (buffer_.size() == 1) {
        for (uint32_t j = 0; j < raw_.size(); ++ j) {
            if (raw_[j] != 0) {
                pos.push_back(1U + strlen(raw_[j]));
                qos.push_back(j);
            }
        }
        ierr = writeBuffer(fptr, nkeys, pos, qos);
    }
    else {
        ierr = writeKeys(fptr, nkeys, pos, qos);
    }
    LOGGER(ibis::gVerbose > 1)
        << evt << " complete with ierr = " << ierr;
    return ierr;
} // ibis::dictionary::write
Пример #12
0
/// Load the dictionary stored in the named file.
///
/// The file size and the first 20 bytes decide which reader is used: a
/// file shorter than 24 bytes, or one whose first 20 bytes differ from
/// _fastbit_dictionary_header, is handed to readRaw; a file carrying the
/// expected header is handed to readKeys.
///
/// Returns the result of the selected reader, or -1 (empty file name),
/// -2 (can not open), -3 / -4 (seek failures), -5 (short header read).
int ibis::dictionary::read(const char* name) {
    if (name == 0 || name[0] == 0)
        return -1;

    std::string evt = "dictionary::read(";
    evt.append(name);
    evt.append(1, ')');

    // open the file to read
    int rc = 0;
    FILE* in = fopen(name, "rb");
    if (in == 0) {
        LOGGER(ibis::gVerbose > 3)
            << "Warning -- " << evt << " failed to open the file ... "
            << (errno ? strerror(errno) : "no free stdio stream");
        return -2;
    }

    ibis::util::timer watch(evt.c_str(), 4);
    IBIS_BLOCK_GUARD(fclose, in);

    // measure the file by seeking to its end
    rc = fseek(in, 0, SEEK_END);
    if (rc != 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to seek to the end of the file";
        return -3;
    }

    const long int nbytes = ftell(in);
    if (nbytes < 24) {
        // too small to hold the header and the count: old style file
        return readRaw(evt.c_str(), in);
    }

    char header[20];
    rc = fseek(in, 0, SEEK_SET);
    if (rc != 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt
            << " failed to seek to the beginning of the file";
        return -4;
    }

    rc = fread(header, 1, 20, in);
    if (rc != 20) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to read the 20-byte header";
        return -5;
    }

    // compare all 20 header bytes, stopping at the first mismatch
    bool matched = true;
    for (int k = 0; matched && k < 20; ++ k)
        matched = (header[k] == _fastbit_dictionary_header[k]);

    if (!matched) {
        LOGGER(ibis::gVerbose > 2)
            << evt << " did not find the expected header, assume "
            "to be an old-style dictionary";
        return readRaw(evt.c_str(), in);
    }

    // got the expected header
    return readKeys(evt.c_str(), in);
} // ibis::dictionary::read
Пример #13
0
/// Read the content of the named file.
///
/// This function determines the version of the dictionary file and invokes
/// the matching reading function to perform the actual reading.  A file
/// longer than 24 bytes whose first 16 bytes match
/// _fastbit_dictionary_header carries a big-endian version number in bytes
/// 16-19.  Three versioned formats are dispatched, plus a fallback:
/// 0x02000000 - read by readKeys2 (the version produced by the current
///              write function),
/// 0x01000000 - read by readKeys1 (64-bit offsets, consecutive keys,
///              strings stored in key order),
/// 0x00000000 - read by readKeys0 (32-bit offsets, strings stored in
///              sorted order),
/// anything else, including files without the header - read by readRaw
///              (bare strings in code order).
///
/// Returns the value of the selected reader, or -1 (empty file name),
/// -2 (can not open), -3 / -4 (seek failures), -5 (short header read).
int ibis::dictionary::read(const char* name) {
    if (name == 0 || *name == 0) return -1;
    std::string evt = "dictionary::read";
    if (ibis::gVerbose > 1) {
        evt += '(';
        evt += name;
        evt += ')';
    }

    // open the file to read
    int ierr = 0;
    FILE* fptr = fopen(name, "rb");
    if (fptr == 0) {
        LOGGER(ibis::gVerbose > 3)
            << "Warning -- " << evt << " failed to open the file ... "
            << (errno ? strerror(errno) : "no free stdio stream");
        return -2;
    }

    ibis::util::timer mytimer(evt.c_str(), 4);
    IBIS_BLOCK_GUARD(fclose, fptr);
    ierr = fseek(fptr, 0, SEEK_END); // to the end
    if (ierr != 0) {
        LOGGER(ibis::gVerbose > 1)
            << "Warning -- " << evt << " failed to seek to the end of the file";
        return -3;
    }

    // 0xFFFFFFFF matches none of the known versions, so it selects readRaw
    uint32_t version = 0xFFFFFFFFU;
    long int sz = ftell(fptr); // file size
    // NOTE(review): a file of exactly 24 bytes (20-byte header plus a
    // 4-byte count) skips header detection and goes to readRaw -- confirm
    // this is intended.
    if (sz > 24) {
        char header[20];
        ierr = fseek(fptr, 0, SEEK_SET);
        if (ierr != 0) {
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- " << evt << " failed to seek to the beginning "
                "of the file";
            return -4;
        }

        ierr = fread(header, 1, 20, fptr);
        if (ierr != 20) {
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- " << evt << " failed to read the 20-byte header";
            return -5;
        }

        // only the first 16 bytes form the signature; the last 4 bytes
        // hold the version number
        bool marked = true;
        for (int j = 0; marked && j < 16; ++ j)
            marked = (header[j] == _fastbit_dictionary_header[j]);

        if (marked) {
            // Go through unsigned char before widening: plain char may be
            // signed, in which case a byte >= 0x80 would sign-extend and
            // set all the higher bits of the assembled version number.
            version =
                (static_cast<uint32_t>
                 (static_cast<unsigned char>(header[16])) << 24) |
                (static_cast<uint32_t>
                 (static_cast<unsigned char>(header[17])) << 16) |
                (static_cast<uint32_t>
                 (static_cast<unsigned char>(header[18])) << 8) |
                static_cast<uint32_t>
                (static_cast<unsigned char>(header[19]));
            LOGGER(ibis::gVerbose > 3)
                << evt << " detected dictionary version 0x" << std::hex
                << version << std::dec;
        }
        else {
            LOGGER(ibis::gVerbose > 2)
                << evt << " did not find the expected header, assume "
                "to have no header (oldest version of dictioinary)";
        }
    }

    // invoke the actual reader based on version number
    switch (version) {
    case 0x02000000:
            ierr = readKeys2(evt.c_str(), fptr);
            break;
    case 0x01000000:
            ierr = readKeys1(evt.c_str(), fptr);
            break;
    case 0x00000000:
            ierr = readKeys0(evt.c_str(), fptr);
            break;
    default:
            ierr = readRaw(evt.c_str(), fptr);
            break;
    }
    // at high verbosity, dump the dictionary that was just read
    if (ibis::gVerbose > 3) {
        ibis::util::logger lg;
        lg() << evt << " completed with ";
        toASCII(lg());
    }
    return ierr;
} // ibis::dictionary::read
Пример #14
0
/*
 * MPI Game of Life on a square, periodic 2D process grid.
 *
 * Usage: prog nb_iterations size
 *
 * The number of processes must be a perfect square.  Rank 0 generates the
 * global board and scatters one block to every process; each process then
 * iterates on its block, exchanging halo cells through make_communications
 * and overlapping the exchange with the computation of the cells that do
 * not touch the halo.  At the end the blocks are gathered back on rank 0,
 * and rank 0 prints the maximum elapsed time over all ranks, in
 * milliseconds.
 *
 * NOTE(review): BS is divided by the process-grid side with integer
 * division; the code presumably assumes BS is a multiple of it -- confirm.
 *
 * Returns EXIT_SUCCESS, including when only the usage message is printed;
 * exits with EXIT_FAILURE when the process count is not a perfect square.
 */
int main(int argc, char* argv[])
{
    int i, j, loop, num_alive, maxloop;
    int ldgboard,ldboard, ldnbngb;
    double t1, t2;
    double temps;
    int *gboard = NULL;  /* global board, only allocated on rank 0 */
    int *groot = NULL;   /* first real cell of the global board, rank 0 only */
    int *board;
    int *nbngb;

    int size;
    int coord[2], id;
    int procs_per_lines_col;

    MPI_Init(NULL,NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    procs_per_lines_col = sqrt(size);
    if(procs_per_lines_col * procs_per_lines_col != size) {
      fprintf(stderr, "Renseignez un nombre carré de processeurs siouplait !\n");
      MPI_Finalize();
      exit(EXIT_FAILURE);
    }

    /* Square process grid, periodic in both dimensions */
    int dims[2]; dims[0] = procs_per_lines_col; dims[1] = procs_per_lines_col;
    int periods[2]; periods[0] = 1; periods[1] = 1;
    MPI_Comm comm_cart;

    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_cart);
    MPI_Comm_rank(comm_cart, &id);
    MPI_Cart_coords(comm_cart, id, 2, coord);

    if (argc < 3) {
        printf("Usage: %s nb_iterations size\n", argv[0]);
        /* MPI was initialised above, so shut it down before leaving */
        MPI_Comm_free(&comm_cart);
        MPI_Finalize();
        return EXIT_SUCCESS;
    } else {
        maxloop = atoi(argv[1]);
        BS = atoi(argv[2]);
    }
    num_alive = 0;

    /* Leading dimension of the global board array */
    ldgboard = BS + 2;
    /* Leading dimension of the local board array (block plus halo) */
    ldboard = BS/procs_per_lines_col + 2;
    /* Leading dimension of the neighbour counters array */
    ldnbngb = BS/procs_per_lines_col;

    board = malloc( ldboard * ldboard * sizeof(int) );
    nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) );

    if(id == 0) {
      gboard = malloc(ldgboard * ldgboard * sizeof(int));
      groot = &gboard[1+ldgboard];
      /* This initial count is overwritten by the per-iteration count below */
      num_alive = generate_initial_board( BS, groot, ldgboard );
    }

    /* One block as laid out inside the global board.  The extent is
       resized to one int so that displacements can be given in cells. */
    MPI_Datatype block_vec, block;
    MPI_Type_vector(ldboard-2, ldboard-2, ldgboard, MPI_INT, &block_vec);
    MPI_Type_create_resized(block_vec, 0, sizeof(int), &block);
    MPI_Type_commit(&block);
    MPI_Type_free(&block_vec);

    /* The same block as laid out inside a local board */
    MPI_Datatype subblock_vec, subblock;
    MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &subblock_vec);
    MPI_Type_create_resized(subblock_vec, 0, sizeof(int), &subblock);
    MPI_Type_commit(&subblock);
    MPI_Type_free(&subblock_vec);

    int * counts = (int*) malloc(size*sizeof(int));
    int * displs = (int*) malloc(size*sizeof(int));
    /* Displacement (in cells) of every process's block in the global board */
    for (int pi = 0; pi < procs_per_lines_col; ++pi)
      {
        for (int pj = 0; pj < procs_per_lines_col; ++pj)
          {
            counts[pi+pj*procs_per_lines_col]= 1;
            displs[pi+pj*procs_per_lines_col]= pi*ldgboard*(ldboard-2)+pj*(ldboard-2);
          }
      }
    /* groot is NULL away from the root, where the send buffer is ignored */
    MPI_Scatterv(groot, counts, displs, block, &board[ldboard+1], 1,
                 subblock,0, comm_cart);

    int neighbours[8];
    make_neighbours_table(neighbours, comm_cart);
    MPI_Request req[8];

    int block_size = ldboard - 2;
    /* block_size+2 single ints, ldboard ints apart */
    MPI_Datatype block_line;
    MPI_Type_vector(block_size+2, 1, ldboard,MPI_INT, &block_line);
    MPI_Type_commit(&block_line);

    t1 = mytimer();

    for (loop = 1; loop <= maxloop; loop++) {
      /* Start the halo exchange; req[] is waited on piecewise below.
         NOTE(review): presumably this exchange implements the periodic
         boundary conditions -- confirm in make_communications. */
      make_communications(req, comm_cart, neighbours, block_size, board, ldboard, block_line);

      /* Inner cells: they never read a halo cell, so they can be processed
         while the exchange is still in flight.  The outermost ring of the
         block is handled after the matching waits below. */
      for (j = 2; j < block_size; j++) {
        for (i = 2; i < block_size; i++) {
          ngb( i, j ) =
            cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
            cell( i-1, j   ) +                  cell( i+1, j   ) +
            cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
        }
      }

      /* Left edge (i == 1), once requests 0, 4 and 6 have completed */
      MPI_Wait(&req[0], MPI_STATUS_IGNORE);
      MPI_Wait(&req[4], MPI_STATUS_IGNORE);
      MPI_Wait(&req[6], MPI_STATUS_IGNORE);
      for(j = 1; j <= block_size; j++) {
        ngb( 1, j ) =
          cell( 0, j-1 ) + cell( 1, j-1 ) + cell( 2, j-1 ) +
          cell( 0, j   ) +                  cell( 2, j   ) +
          cell( 0, j+1 ) + cell( 1, j+1 ) + cell( 2, j+1 );
      }

      /* Top edge (j == 1), once requests 1 and 5 have completed */
      MPI_Wait(&req[1], MPI_STATUS_IGNORE);
      MPI_Wait(&req[5], MPI_STATUS_IGNORE);
      for(i = 1; i <= block_size; i++) {
        ngb( i, 1 ) =
          cell( i - 1, 0) + cell( i, 0 ) + cell( i + 1, 0 ) +
          cell( i - 1, 1) +                cell( i + 1, 1 ) +
          cell( i - 1, 2) + cell( i, 2 ) + cell( i + 1, 2 );
      }

      /* Right edge (i == block_size), once requests 2 and 7 have completed */
      MPI_Wait(&req[2], MPI_STATUS_IGNORE);
      MPI_Wait(&req[7], MPI_STATUS_IGNORE);
      for(j = 1; j <= block_size; j++) {
        ngb( block_size, j ) =
          cell( block_size - 1, j-1 ) + cell( block_size , j-1 ) + cell( block_size + 1, j-1 ) +
          cell( block_size - 1, j   ) +                            cell( block_size + 1, j   ) +
          cell( block_size - 1, j+1 ) + cell( block_size, j+1 ) + cell(  block_size + 1, j+1 );
      }

      /* Bottom edge (j == block_size), once the last request has completed */
      MPI_Wait(&req[3], MPI_STATUS_IGNORE);
      for(i = 1; i <= block_size; i++) {
        ngb( i, block_size ) =
          cell( i - 1, block_size - 1) + cell( i, block_size - 1 ) + cell( i + 1, block_size - 1 ) +
          cell( i - 1, block_size ) +                cell( i + 1, block_size ) +
          cell( i - 1, block_size + 1 ) + cell( i, block_size + 1 ) + cell( i + 1, block_size + 1 );
      }

      /* Apply the rules: a cell with fewer than 2 or more than 3 live
         neighbours dies, exactly 3 makes it alive, 2 leaves it unchanged.
         Count the live cells of this block along the way. */
      num_alive = 0;
      for (j = 1; j <= block_size; j++) {
        for (i = 1; i <= block_size; i++) {
          if ( (ngb( i, j ) < 2) ||
               (ngb( i, j ) > 3) ) {
            cell(i, j) = 0;
          }
          else {
            if ((ngb( i, j )) == 3)
              cell(i, j) = 1;
          }
          if (cell(i, j) == 1) {
            num_alive ++;
          }
        }
      }
    }
    /* Collect the blocks on rank 0; groot is NULL (and ignored) elsewhere */
    MPI_Gatherv(&board[ldboard+1], 1, subblock, groot, counts,displs, block, 0, comm_cart);

    t2 = mytimer();

    temps = t2 - t1;
    /* Slowest rank's time and the global number of living cells */
    MPI_Allreduce(MPI_IN_PLACE,&temps, 1, MPI_DOUBLE, MPI_MAX, comm_cart);
    MPI_Allreduce(MPI_IN_PLACE,&num_alive, 1, MPI_INT, MPI_SUM, comm_cart);
    if(id == 0) {
      printf("%.2lf\n",(double)temps * 1.e3);
    }

    /* Release MPI objects and heap memory before finalising */
    MPI_Type_free(&block_line);
    MPI_Type_free(&subblock);
    MPI_Type_free(&block);
    MPI_Comm_free(&comm_cart);
    free(displs);
    free(counts);
    free(gboard);  /* NULL on every rank but 0 */
    free(board);
    free(nbngb);
    MPI_Finalize();
    return EXIT_SUCCESS;
}
Пример #15
0
/// Solve A x = b with the (unpreconditioned) conjugate-gradient iteration.
///
/// @param A            operator; A.has_local_indices must be true, otherwise
///                     an error is printed and the function returns at once
/// @param b            right-hand side
/// @param x            initial guess on entry, updated iterate on exit
/// @param matvec       callable invoked as matvec(A, p, Ap)
/// @param max_iter     maximum number of iterations
/// @param tolerance    iteration stops once the residual norm is not above it
/// @param num_iters    set to the index of the last completed iteration; left
///                     untouched when no iteration completes
/// @param normr        residual norm, sqrt of dot_r2(r), as last evaluated
/// @param my_cg_times  receives the accumulated WAXPY, DOT, MATVEC, MATVECDOT
///                     and TOTAL timings on every exit path
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  // t0 is kept for the TICK()/TOCK() macros (presumably their scratch
  // variable -- confirm against their definition)
  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    // Nothing was timed; give the caller defined values instead of leaving
    // the timer slots untouched.
    my_cg_times[WAXPY] = 0;
    my_cg_times[DOT] = 0;
    my_cg_times[MATVEC] = 0;
    my_cg_times[MATVECDOT] = 0;
    my_cg_times[TOTAL] = 0;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  // work vectors: residual, search direction and A*p
  VectorType r(b.startIndex, nrows, 256);
  VectorType p(0, ncols, 512);
  VectorType Ap(b.startIndex, nrows, 64);

  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  // print the residual roughly ten times per solve, at least every 50 iterations
  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq>50) print_freq = 50;
  if (print_freq<1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  // p = x, Ap = A*p, r = b - Ap: residual of the initial guess
  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot_r2(r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

#ifdef MINIFE_DEBUG_OPENMP
  std::cout << "Starting CG Solve Phase..." << std::endl;
#endif

  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      // first direction is the residual itself
      TICK(); daxpby(one, r, zero, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot_r2(r); TOCK(tDOT);
      const magnitude_type beta = rtrans/oldrtrans;
      TICK(); daxpby(one, r, beta, p); TOCK(tWAXPY);
    }

    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);
    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    if (p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!"<<std::endl;
#endif
        // Leave the loop; the timers (including MATVECDOT) are stored
        // below, exactly as for a normal exit.  num_iters keeps the index
        // of the last completed iteration.
        break;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    // x += alpha*p and r -= alpha*Ap
    TICK(); daxpby(alpha, p, one, x);
            daxpby(-alpha, Ap, one, r); TOCK(tWAXPY);

    num_iters = k;
  }

  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}
Пример #16
0
/// Run the whole miniFE pipeline for one problem.
///
/// Steps, in order: optional load-imbalance adjustment, mesh description,
/// matrix structure generation, finite-element assembly, Dirichlet boundary
/// conditions, conversion to local indices, then either the kernel timing
/// loops (MINIFE_KERNELS != 0) or the CG solve, and finally the report of
/// iteration counts, flop counts and rates through @p ydoc on process 0.
///
/// @param global_box  global problem box; entry [d][1] is used as the
///                    extent in dimension d
/// @param my_box      this process's box; may be modified by add_imbalance
/// @param params      run parameters (load_imbalance, mv_overlap_comm_comp,
///                    verify_solution are read here)
/// @param ydoc        report document that receives the results
/// @return the value of verify_solution when verification was requested
///         and the standard (non-overlap) CG path ran, 0 otherwise
int
driver(const Box& global_box, Box& my_box,
       Parameters& params, YAML_Doc& ydoc)
{
  int global_nx = global_box[0][1];
  int global_ny = global_box[1][1];
  int global_nz = global_box[2][1];

  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (params.load_imbalance > 0) {
    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
  }

  float largest_imbalance = 0, std_dev = 0;
  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
                                   std_dev, ydoc, true);


  //Create a representation of the mesh:
  //Note that 'simple_mesh_description' is a virtual or conceptual
  //mesh that doesn't actually store mesh data.
#ifdef TIME_IT
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "creating/filling mesh...";
    std::cout.flush();
  }
#endif

  timer_type t_start = mytimer();
  timer_type t0 = mytimer();

  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);

  timer_type mesh_fill = mytimer() - t0;
  timer_type t_total = mytimer() - t_start;

#ifdef TIME_IT
  if (myproc==0) {
    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
  }
#endif

  //next we will generate the matrix structure.

  //Declare matrix object:

#if defined(MINIFE_ELL_MATRIX)
  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#else
  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#endif

  MatrixType A;

  timer_type gen_structure;
  RUN_TIMED_FUNCTION("generating matrix structure...",
                     generate_matrix_structure(mesh, A),
                     gen_structure, t_total);

  GlobalOrdinal local_nrows = A.rows.size();
  //-1 marks "no local rows" as the first-row index
  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;

  Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
  Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);

  //Assemble finite-element sub-matrices and sub-vectors into the global
  //linear system:

  timer_type fe_assembly;
  RUN_TIMED_FUNCTION("assembling FE data...",
                     assemble_FE_data(mesh, A, b, params),
                     fe_assembly, t_total);

  if (myproc == 0) {
    ydoc.add("Matrix structure generation","");
    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
    ydoc.add("FE assembly","");
    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
  }

#ifdef MINIFE_DEBUG
  write_matrix("A_prebc.mtx", A);
  write_vector("b_prebc.vec", b);
#endif

  //Now apply dirichlet boundary-conditions
  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)

  timer_type dirbc_time;
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
            impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total);
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
            impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A.mtx", A);
  write_vector("b.vec", b);
#endif

  //Transform global indices to local, set up communication information:

  timer_type make_local_time;
  RUN_TIMED_FUNCTION("making matrix indices local...",
                     make_local_matrix(A),
                     make_local_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A_local.mtx", A);
  write_vector("b_local.vec", b);
#endif

  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);

  //Prepare to perform conjugate gradient solve:

  LocalOrdinal max_iters = 200;
  LocalOrdinal num_iters = 0;
  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
  magnitude rnorm = 0;
  magnitude tol = std::numeric_limits<magnitude>::epsilon();

  //Zero-initialized: the report below reads these slots even on the path
  //where no solver runs (overlap requested without a CSR matrix).
  timer_type cg_times[NUM_TIMERS] = {};

  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;

  t_total = mytimer() - t_start;

  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;

  int verify_result = 0;

#if MINIFE_KERNELS != 0
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "Starting kernel timing loops ..." << std::endl;
  }

  max_iters = 500;
  x.coefs[0] = 0.9;
  if (matvec_with_comm_overlap) {
    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  else {
    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  num_iters = max_iters;
  std::string title("Kernel timings");
#else
  if (myproc==0) {
    std::cout << "Starting CG solver ... " << std::endl;
  }

  if (matvec_with_comm_overlap) {
#ifdef MINIFE_CSR_MATRIX
    rearrange_matrix_local_external(A);
    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
           num_iters, rnorm, cg_times);
#else
    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
#endif
  }
  else {
    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
           num_iters, rnorm, cg_times);
    if (myproc == 0) {
      std::cout << "Final Resid Norm: " << rnorm << std::endl;
    }

    if (params.verify_solution > 0) {
      double tolerance = 0.06;
      bool verify_whole_domain = false;
  #ifdef MINIFE_DEBUG
      verify_whole_domain = true;
  #endif
      if (myproc == 0) {
        if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
        else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
      }
      verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
    }
  }

#ifdef MINIFE_DEBUG
  write_vector("x.vec", x);
#endif
  std::string title("CG solve");
#endif

  if (myproc == 0) {
    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
    ydoc.add(title,"");
    ydoc.get(title)->add("Iterations",num_iters);
    ydoc.get(title)->add("Final Resid Norm",rnorm);

    //Multiply one factor at a time so that every product is formed in
    //GlobalOrdinal rather than in int, where ny*nz could overflow first.
    GlobalOrdinal global_nrows = global_nx;
    global_nrows *= global_ny;
    global_nrows *= global_nz;

    //flops-per-mv, flops-per-dot, flops-per-waxpy:
    double mv_flops = global_nnz*2.0;
    double dot_flops = global_nrows*2.0;
    double waxpy_flops = global_nrows*3.0;

#if MINIFE_KERNELS == 0
//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
//there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
    mv_flops *= (num_iters+1);
    dot_flops *= (2*num_iters);
    waxpy_flops *= (3*num_iters+2);
#else
//if MINIFE_KERNELS then we did one of each operation per iteration.
    mv_flops *= num_iters;
    dot_flops *= num_iters;
    waxpy_flops *= num_iters;
#endif

    double total_flops = mv_flops + dot_flops + waxpy_flops;

    //A rate stays at -1 when the measured time is too small (<= 1.e-4)
    //to divide by; it is then reported as "inf" below.
    double mv_mflops = -1;
    if (cg_times[MATVEC] > 1.e-4)
      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);

    double dot_mflops = -1;
    if (cg_times[DOT] > 1.e-4)
      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);

    double waxpy_mflops = -1;
    if (cg_times[WAXPY] > 1.e-4)
      waxpy_mflops = 1.e-6 *  (waxpy_flops/cg_times[WAXPY]);

    double total_mflops = -1;
    if (cg_times[TOTAL] > 1.e-4)
      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);

    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
    if (waxpy_mflops >= 0)
      ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
    else
      ydoc.get(title)->add("WAXPY Mflops","inf");

    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
    ydoc.get(title)->add("DOT Flops",dot_flops);
    if (dot_mflops >= 0)
      ydoc.get(title)->add("DOT Mflops",dot_mflops);
    else
      ydoc.get(title)->add("DOT Mflops","inf");

    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
    ydoc.get(title)->add("MATVEC Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVEC Mflops","inf");

#ifdef MINIFE_FUSED
    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVECDOT Mflops","inf");
#endif

#if MINIFE_KERNELS == 0
    ydoc.get(title)->add("Total","");
    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
    if (total_mflops >= 0)
      ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
    else
      ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
    ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
#endif
  }

  return verify_result;
}
Пример #17
0
gint anim_next_frame(struct model_pak *model)
{
    gulong time;
    gchar *text, *name;

    g_assert(model != NULL);

    /* increment and test if should we return to the start */
    model->cur_frame += model->anim_step;
    if (model->cur_frame >= model->num_frames)
        if (model->anim_loop)
            model->cur_frame = 1;

    /* continue until we run out of frames (or a stop is flagged) */
    if (model->cur_frame < model->num_frames && model->animating)
    {
#if DEBUG_DISPLAY_NEXT_FRAME
        printf("displaying [%d]\n", model->cur_frame);
#endif

        time = mytimer();

        /* if a dialog exists - update via the current frame spinner */
        if (dialog_exists(ANIM, model))
            gui_relation_update(model);
        else
        {
            /* otherwise, update manually */
            read_frame(model->afp, model->cur_frame, model);
            meas_graft_model(model);
            gui_active_refresh();
            redraw_canvas(SINGLE);
        }

        /* animation adjusted redraw time */
        time = mytimer() - time;
        model->redraw_cumulative += time;

        /* NEW - render to file */
        if (sysenv.render.animate)
        {
            text = g_strdup_printf("%s_%06d.pov", sysenv.render.animate_file, model->cur_frame);
            name = g_build_filename(sysenv.cwd, text, NULL);

            write_povray(name, model);

            /* NB: added this as jago keeps locking up on multi-frame renders */
            if (!sysenv.render.no_povray_exec)
                povray_exec(name);

            g_free(text);
            g_free(name);
        }

        return(TRUE);
    }

    /* FIXME - find a better way to do this... */
    if (!model->transform_list)
        fclose(model->afp);

    /* done animation */
    model->animating = FALSE;
    model->cur_frame--;

    /* create movie? */
    if (sysenv.render.animate && !sysenv.render.no_povray_exec)
    {
        text = NULL;
        switch (sysenv.render.animate_type)
        {
        case ANIM_GIF:
            text = g_strdup_printf("%s -delay %d %s_*.tga %s.gif",
                                   sysenv.convert_path,
                                   (gint) sysenv.render.delay,
                                   sysenv.render.animate_file, sysenv.render.animate_file);
            break;

        case ANIM_MPEG:
            text = g_strdup_printf("%s -quality %d -delay %d %s_*.tga %s.mpg",
                                   sysenv.convert_path, (gint) sysenv.render.mpeg_quality,
                                   (gint) sysenv.render.delay,
                                   sysenv.render.animate_file, sysenv.render.animate_file);
            break;
        }

        if (text)
        {
            system(text);
            g_free(text);
            gui_text_show(DEFAULT, "Completed movie creation.\n");
        }
    }

    /* cleanup */
    if (sysenv.render.no_keep_tempfiles)
    {
#ifndef __WIN32
        text = g_strdup_printf("rm -rf %s_*.pov", sysenv.render.animate_file);
        system(text);
        g_free(text);
        text = g_strdup_printf("rm -rf %s_*.tga", sysenv.render.animate_file);
        system(text);
        g_free(text);
#endif
        /* TODO - windows equivalents */
    }

    /* done - return FALSE to terminate the timer */
    return(FALSE);
}
Пример #18
0
/*
 * Game of Life driver, OpenMP version.
 *
 * Command line: [maxloop [BS [num_threads]]]
 *   maxloop     - number of generations to simulate (default 10)
 *   BS          - board side length; overrides the file-level BS
 *   num_threads - OpenMP thread count; overrides the file-level num_threads
 *
 * The board is stored with a one-cell halo on every side (leading dimension
 * BS + 2).  Each generation copies the opposite edges into the halo so the
 * board wraps around, counts the eight neighbours of every cell, and then
 * applies the update rule.  The elapsed wall time is printed and, when built
 * with BENCH, also written to "time_omp_<num_threads>.dat".
 *
 * NOTE: the local names board, nbngb, ldboard and ldnbngb are presumably
 * referenced by the cell()/ngb() macros defined elsewhere - do not rename.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the boards cannot be allocated.
 */
int main(int argc, char* argv[])
{
    int i, j, loop, num_alive, maxloop;
    int ldboard, ldnbngb;
    double t1, t2;
    double temps;

    int *board;
    int *nbngb;

    /* command-line handling: every argument is optional */
    maxloop = 10;
    if (argc >= 2) {
        maxloop = atoi(argv[1]);
        if (argc > 2)
            BS = atoi(argv[2]);
        if (argc > 3)
            num_threads = atoi(argv[3]);
    }
    omp_set_num_threads(num_threads);
    num_alive = 0;

    /* Leading dimension of the board array */
    ldboard = BS + 2;
    /* Leading dimension of the neighbour counters array */
    ldnbngb = BS;

    /* size computed in size_t so that large boards do not overflow int */
    board = (int *)malloc( (size_t)ldboard * (size_t)ldboard * sizeof(int) );
    nbngb = (int *)malloc( (size_t)ldnbngb * (size_t)ldnbngb * sizeof(int) );
    if (board == NULL || nbngb == NULL) {
        fprintf(stderr, "Unable to allocate a %d x %d board\n", BS, BS);
        free(board);
        free(nbngb);
        return EXIT_FAILURE;
    }

    num_alive = generate_initial_board( BS, &(cell(1, 1)), ldboard );
#ifdef OUTPUT_BOARD
    output_board( BS, &(cell(1, 1)), ldboard, 0 );
#endif

    printf("Starting number of living cells = %d\n", num_alive);
    t1 = mytimer();

    for (loop = 1; loop <= maxloop; loop++) {

        /* wrap the four corners into the halo */
        cell(   0, 0   ) = cell(BS, BS);
        cell(   0, BS+1) = cell(BS,  1);
        cell(BS+1, 0   ) = cell( 1, BS);
        cell(BS+1, BS+1) = cell( 1,  1);

        /* wrap the four edges into the halo */
        #pragma omp parallel for
        for (i = 1; i <= BS; i++) {
            cell(   i,    0) = cell( i, BS);
            cell(   i, BS+1) = cell( i,  1);
            cell(   0,    i) = cell(BS,  i);
            cell(BS+1,    i) = cell( 1,  i);
        }

        /* count the eight neighbours of every cell */
        #pragma omp parallel for private(i)
        for (j = 1; j <= BS; j++) {
            for (i = 1; i <= BS; i++) {
                ngb( i, j ) =
                    cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
                    cell( i-1, j   ) +                  cell( i+1, j   ) +
                    cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
            }
        }

        /* apply the rule: <2 or >3 neighbours dies, exactly 3 is born,
           exactly 2 keeps its state; count survivors as we go */
        num_alive = 0;
        #pragma omp parallel for private (i) reduction(+:num_alive)
        for (j = 1; j <= BS; j++) {
            for (i = 1; i <= BS; i++) {
                if ( (ngb( i, j ) < 2) ||
                     (ngb( i, j ) > 3) ) {
                    cell(i, j) = 0;
                }
                else {
                    if ((ngb( i, j )) == 3)
                        cell(i, j) = 1;
                }
                if (cell(i, j) == 1) {
                    num_alive ++;
                }
            }
        }
#ifdef OUTPUT_BOARD
        output_board( BS, &(cell(1, 1)), ldboard, loop);
#endif
#ifdef PRINT_ALIVE
        printf("%d \n", num_alive);
#endif
    }

    t2 = mytimer();
    temps = t2 - t1;
    printf("Final number of living cells = %d\n", num_alive);
    printf("time=%.2lf ms\n",(double)temps * 1.e3);
#ifdef BENCH
    {
        char fname [40];
        FILE* f;

        snprintf(fname, sizeof fname, "time_omp_%d.dat", num_threads);
        f = fopen(fname, "w");
        /* only write and close when the file was actually opened;
           fclose(NULL) is undefined behaviour */
        if (f != NULL) {
            fprintf(f,"%.2lf", temps*1.e3);
            fclose(f);
        }
    }
#endif
#ifdef OUTPUT_BOARD
    output_board( BS, &(cell(1, 1)), ldboard, maxloop);
#endif

    free(board);
    free(nbngb);
    return EXIT_SUCCESS;
}
Пример #19
0
/*
 * Game of Life driver, MPI version.
 *
 * Command line: [loop_iterations [BS]]   or   -h  (print help and exit)
 *   loop_iterations - number of generations (default 10)
 *   BS              - side length of the global board (default 30)
 *
 * The processes are arranged in a square Cartesian grid that is periodic in
 * both dimensions, so the process count must be a perfect square.  Rank 0
 * generates the global board and scatters one square sub-block to every
 * process; each process keeps its sub-block with a one-cell halo.  Every
 * generation starts the halo exchange, computes the interior neighbour
 * counts while the messages are in flight, waits for the neighbours needed
 * by each border, processes that border, and finally applies the update
 * rule.  At the end the sub-blocks are gathered on rank 0, which reports the
 * total number of living cells and the slowest rank's elapsed time.
 *
 * NOTE(review): BS is assumed to be a multiple of sqrt(size); this is not
 * checked here.
 * NOTE: the local names board, nbngb, ldboard and ldnbngb are presumably
 * referenced by the cell()/ngb() macros defined elsewhere - do not rename.
 */
int main(int argc, char* argv[]){
  MPI_Init(NULL, NULL);
  int rank, size;
  int loop, num_alive, loop_iterations;
  int ldboard, ldnbngb, ldglobalboard;
  double t1, local_time, final_time;
  int periods[2] = {1, 1};
  int *globboard= NULL;
  int *globboard2= NULL;
  int *board;
  int *nbngb;

  /* Initialization of MPI */
  MPI_Comm_rank( MPI_COMM_WORLD, &rank );
  MPI_Comm_size( MPI_COMM_WORLD, &size);
  if(argc >= 2){
    if(!strcmp("-h",argv[1])){
      if(!rank)
        helper();
      MPI_Finalize();
      return EXIT_SUCCESS;
    }
  }
  int i, j;
  int process_per_row = sqrt(size);
  int process_per_column = sqrt(size);
  int dims[2] = {process_per_row, process_per_column};

  // It only works if the number of process in the input is a perfect square
  if(size != process_per_column*process_per_row){
    fprintf(stderr, "Square Perfect needed as input size.\nExiting Program.");
    MPI_Finalize();
    return EXIT_FAILURE;
  }

  MPI_Comm grid;

  // Initialize cartesian grid
  MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods,0, &grid);
  MPI_Comm_rank(grid, &rank);

  /* User input: both arguments are optional */
  loop_iterations = 10;
  BS = 30;
  if (argc >= 2){
    loop_iterations = atoi(argv[1]);
    if(argc > 2)
      BS = atoi(argv[2]);
  }
  num_alive = 0;

  /* Leading dimension of global board array */
  ldglobalboard = BS + 2; // +2 for the halo cell on each side
  /* Leading dimension of board array */
  ldboard = BS/process_per_row + 2; // +2 for the halo cell on each side
  /* Leading dimension of neighbour array: same per-process block size as
     ldboard without the halo (integer division, no floating point) */
  ldnbngb = BS/process_per_row;

  // Initialization of cells board
  board = (int *)malloc( ldboard * ldboard * sizeof(int) );
  nbngb = (int *)malloc( ldnbngb * ldnbngb * sizeof(int) );

  // The global boards exist on rank 0 only
  if(!rank){
    globboard = (int *)malloc(ldglobalboard*ldglobalboard * sizeof(int));
    globboard2 = (int *)malloc(ldglobalboard*ldglobalboard * sizeof(int));
    num_alive = generate_initial_board( BS, &globboard[1+ldglobalboard] , ldglobalboard );
    output_board( BS, &globboard[1+ldglobalboard], ldglobalboard, 0 );
    fprintf(stderr, "Starting number of living cells = %d\n", num_alive);
  }

  /* Root-side buffers skip the halo; they stay NULL on the other ranks so
     that no pointer arithmetic is performed on a null pointer. */
  int *global_in  = globboard  ? &globboard[1+ldglobalboard]  : NULL;
  int *global_out = globboard2 ? &globboard2[1+ldglobalboard] : NULL;

  // Block type describing one process' cells inside the global board
  MPI_Datatype block2, block;
  MPI_Type_vector(ldboard-2, ldboard-2, ldglobalboard, MPI_INT, &block2);
  MPI_Type_create_resized(block2, 0, sizeof(int), &block);
  MPI_Type_commit(&block);

  // Block type describing the same cells inside the local (haloed) board
  MPI_Datatype sub_block2, sub_block;
  MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &sub_block2);
  MPI_Type_create_resized(sub_block2, 0, sizeof(int), &sub_block);
  MPI_Type_commit(&sub_block);

  // one block per process
  int *process_count = (int*)malloc(size*sizeof(int));
  // displacement (in ints) of each process' block inside the global board
  int *cell_per_processes = (int*)malloc(size*sizeof(int));

  // Prepare the counts/displacements used by the scatter and the gather
  for (i = 0; i < process_per_row; ++i){
    for (j = 0; j < process_per_column; ++j){
      process_count[i+j*process_per_column]= 1;
      cell_per_processes[i+j*process_per_column]= i*ldglobalboard*(ldboard-2)+j*(ldboard-2);
    }
  }

  /* Explodes matrix into sub_blocks elements */
  MPI_Scatterv(global_in, process_count, cell_per_processes, block, &board[ldboard+1], 1, sub_block,0, grid);

  // Initialize for each processes, a table of the neighbours.
  int neighbours[8];
  neighbour_table(neighbours, grid, rank);

  /* Time to begin */
  t1 = mytimer();
  int blocksize = ldboard-2;
  MPI_Datatype row_blocks;
  MPI_Type_vector(blocksize, 1, ldboard, MPI_INT, &row_blocks);
  MPI_Type_commit(&row_blocks);

  // status for waiting time...
  MPI_Status mpi_status;

  // One request per possible neighbour (8 in the worst case)
  MPI_Request cart_request[8];
  for (loop = 1; loop <= loop_iterations; ++loop) {
    /* Start communications to send and recv informations from neighbours */
    inter_proc_communications(cart_request, neighbours, grid, blocksize, board, ldboard, row_blocks);

    /* Compute inside process cells (they do not depend on the halo) */
    for (j = 2; j <= blocksize-1; ++j) {
      for (i = 2; i <= blocksize-1; ++i) {
        ngb( i, j ) =
          cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) +
          cell( i-1, j   ) +                  cell( i+1, j   ) +
          cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 );
      }
    }

    /* Computes cells on the border */

    // Cell neighbour's composition

    // 4 2 5       4           4 2 5       4 2 5       4 2 5 //
    // 0 X 1  -->  0      -->  0      -->  0   1  -->  0   1 //
    // 6 3 7       6           6           6   7       6 3 7 //

    /* Column on the left needs data from the left process --> 4, 0, 6*/
    MPI_Wait(&cart_request[0], &mpi_status);
    MPI_Wait(&cart_request[4], &mpi_status);
    MPI_Wait(&cart_request[6], &mpi_status);
    process_frontier(1, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb);

    /* Line above needs data from the above process --> 2, 5 */
    MPI_Wait(&cart_request[2], &mpi_status);
    MPI_Wait(&cart_request[5], &mpi_status);
    process_frontier(1, blocksize, board, ROW, ldboard, nbngb, ldnbngb);

    /* Column on the right needs data from the right process --> 1, 7 */
    MPI_Wait(&cart_request[1], &mpi_status);
    MPI_Wait(&cart_request[7], &mpi_status);
    process_frontier(blocksize, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb);

    /* Line under needs data from under process --> 3 */
    MPI_Wait(&cart_request[3], &mpi_status);
    process_frontier(blocksize, blocksize, board, ROW, ldboard, nbngb, ldnbngb);


    /* Update the cell */
    num_alive = 0;
    for (j = 1; j <= blocksize; ++j) {
      for (i = 1; i <= blocksize; ++i) {
        if ( (ngb( i, j ) < 2) ||
             (ngb( i, j ) > 3) ) {
          cell(i, j) = 0;
        }
        else {
          if ((ngb( i, j )) == 3)
            cell(i, j) = 1;
        }
        if (cell(i, j) == 1) {
          num_alive+=1;
        }
      }
    }
    printf("%d \n", num_alive);
  }

  /* Reassembles matrix into one from the sub blocks in the block */
  MPI_Gatherv(&board[ldboard+1], 1, sub_block, global_out, process_count, cell_per_processes, block, 0, grid);

  /* Reduction to determine max time execution */
  local_time = mytimer() - t1;
  MPI_Allreduce(&local_time, &final_time, 1,MPI_DOUBLE, MPI_MAX, grid);

  /* Reduction to determine number of cells still alive in all processes */
  MPI_Allreduce(MPI_IN_PLACE, &num_alive, 1, MPI_INT, MPI_SUM, grid);

  /* The END */
  if(!rank){
    // How many cells are still alive at the end?
    printf("Final number of living cells = %d\n", num_alive);
    // report the slowest rank's time (result of the MPI_MAX reduction)
    printf("time=%.2lf ms\n", final_time * 1.e3);
    char str [100];
    // create debug file
    sprintf(str, "mpi_debug_%d.dat", size);
    FILE *fd = fopen(str, "w");
    if (fd == NULL) {
      /* abort the whole job instead of exit()ing one rank without
         shutting MPI down, which can leave the other ranks hanging */
      fprintf(stderr, "Unable to open %s for writing\n", str);
      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
      return EXIT_FAILURE;
    }
    fprintf(fd,"%.2lf", final_time*1.e3);
    fclose(fd);
    output_board( BS, global_out, ldglobalboard, loop_iterations);
  }

  // Release MPI objects created above
  MPI_Type_free(&row_blocks);
  MPI_Type_free(&sub_block);
  MPI_Type_free(&sub_block2);
  MPI_Type_free(&block);
  MPI_Type_free(&block2);
  MPI_Comm_free(&grid);

  // Release memory (free(NULL) is a no-op on the non-root ranks)
  free(process_count);
  free(cell_per_processes);
  free(globboard);
  free(globboard2);
  free(board);
  free(nbngb);
  MPI_Finalize();
  // The final end
  return EXIT_SUCCESS;
}
Пример #20
0
/// Build the result table of the join for the named columns.
///
/// Steps, in order:
///  - if the number of result rows is not known yet (nrows < 0), count()
///    is invoked first;
///  - when @c colnames is empty or there are no result rows, a table
///    without columns (ibis::tabula) carrying only the row count is returned;
///  - each name is resolved to a column of partr_ or parts_.  A name may be
///    prefixed with "<partition>."; without a usable prefix partr_ is tried
///    first and parts_ second;
///  - the values of the selected columns are retrieved through maskr_ /
///    masks_ and reordered with orderr_ / orders_;
///  - fillResult assembles the output according to the type of the join
///    column, and dictionaries of categorical columns are attached to the
///    corresponding output columns.
///
/// @param colnames names of the columns to place in the result.
/// @return a newly allocated table, or a null pointer if any step fails
/// (a warning is logged when ibis::gVerbose is high enough).
ibis::table*
ibis::jRange::select(const ibis::table::stringArray& colnames) const {
    ibis::table *res = 0;
    if (nrows < 0) {
        int64_t ierr = count();
        if (ierr < 0) {
            LOGGER(ibis::gVerbose > 0)
                << "Warning -- jRange::count failed with error code"
                << ierr;
            return res;
        }
    }
    if (valr_ == 0 || orderr_ == 0 || vals_ == 0 || orders_ == 0 ||
        orderr_->size() != maskr_.cnt() || orders_->size() != masks_.cnt()) {
        LOGGER(ibis::gVerbose > 0)
            << "Warning -- jRange::select failed to evaluate the join";
        return res;
    }
    if (colnames.empty() || nrows == 0) {
        std::string nm = ibis::util::shortName(desc_);
        res = new ibis::tabula(nm.c_str(), desc_.c_str(), nrows);
        return res;
    }

    // compose a description of this operation for timing and log messages
    const uint32_t ncols = colnames.size();
    std::string evt;
    evt = "select ";
    evt += colnames[0];
    for (uint32_t j = 1; j < ncols; ++ j) {
        evt += ", ";
        evt += colnames[j];
    }
    // insert " for " unless desc_ already starts with "from" (any case)
    if ((desc_[0] != 'F' && desc_[0] != 'f') ||
        (desc_[1] != 'R' && desc_[1] != 'r') ||
        (desc_[2] != 'O' && desc_[2] != 'o') ||
        (desc_[3] != 'M' && desc_[3] != 'm'))
        evt += " for ";
    else
        evt += ' ';
    evt += desc_;
    ibis::util::timer mytimer(evt.c_str());
    // ipToPos[j] locates colnames[j] among the retrieved buffers;
    // ncols+1 marks "not assigned yet"
    std::vector<uint32_t> ipToPos(colnames.size());
    std::vector<const ibis::column*> ircol, iscol;
    std::vector<const ibis::dictionary*> cats(colnames.size(), 0);
    // identify the names from the two data partitions
    for (uint32_t j = 0; j < ncols; ++ j) {
        ipToPos[j] = ncols+1;
        const char* cn = colnames[j];
        std::string tname;
        // split "prefix.column" at the first '.'
        while (*cn != 0 && *cn != '.') {
            tname += *cn;
            ++ cn;
        }
        if (*cn == '.') {
            ++ cn;
        }
        else { // did not find '.'
            tname.erase();
            cn = colnames[j];
        }
        int match = -1; // 0 ==> partr_, 1 ==> parts_
        if (! tname.empty()) {
            match = frm_->position(tname.c_str());
            if (match >= static_cast<long>(frm_->size())) {
                // prefix not found in frm_, compare it with the
                // partition names directly
                if (stricmp(tname.c_str(), partr_.name()) == 0) {
                    match = 0;
                }
                else if (stricmp(tname.c_str(), parts_.name()) == 0) {
                    match = 1;
                }
            }
        }

        if (match == 0) {
            const ibis::column *col = partr_.getColumn(cn);
            if (col != 0) {
                ipToPos[j] = ircol.size();
                ircol.push_back(col);
                if (col->type() == ibis::CATEGORY) {
                    const ibis::category *cat =
                        static_cast<const ibis::category*>(col);
                    cats[j] = cat->getDictionary();
                }
                else if (col->type() == ibis::UINT) {
                    const ibis::bord::column *bc =
                        dynamic_cast<const ibis::bord::column*>(col);
                    if (bc != 0) {
                        cats[j] = bc->getDictionary();
                    }
                }
            }
            else {
                LOGGER(ibis::gVerbose > 0)
                    << "Warning -- " << evt << " can not find column named \""
                    << colnames[j] << "\" in data partition \"" << partr_.name()
                    << "\"";
                return res;
            }
        }
        else if (match == 1) {
            const ibis::column *col = parts_.getColumn(cn);
            if (col != 0) {
                // columns of S are temporarily numbered downward from
                // ncols; they are renumbered after this loop
                ipToPos[j] = ncols - iscol.size();
                iscol.push_back(col);
                if (col->type() == ibis::CATEGORY) {
                    const ibis::category *cat =
                        static_cast<const ibis::category*>(col);
                    cats[j] = cat->getDictionary();
                }
                else if (col->type() == ibis::UINT) {
                    const ibis::bord::column *bc =
                        dynamic_cast<const ibis::bord::column*>(col);
                    if (bc != 0) {
                        cats[j] = bc->getDictionary();
                    }
                }
            }
            else {
                LOGGER(ibis::gVerbose > 0)
                    << "Warning -- " << evt << " can not find column named \""
                    << colnames[j] << "\" in data partition \""
                    << parts_.name() << "\"";
                return res;
            }
        }
        else { // not prefixed with a data partition name
            cn = colnames[j];
            const ibis::column* col = partr_.getColumn(cn);
            if (col != 0) {
                ipToPos[j] = ircol.size();
                ircol.push_back(col);
                if (col->type() == ibis::CATEGORY) {
                    const ibis::category *cat =
                        static_cast<const ibis::category*>(col);
                    cats[j] = cat->getDictionary();
                }
                else if (col->type() == ibis::UINT) {
                    const ibis::bord::column *bc =
                        dynamic_cast<const ibis::bord::column*>(col);
                    if (bc != 0) {
                        cats[j] = bc->getDictionary();
                    }
                }
                LOGGER(ibis::gVerbose > 3)
                    << evt << " encountered a column name ("
                    << colnames[j] << ") that does not start with a data "
                    "partition name, assume it is for \"" << partr_.name()
                    << "\"";
            }
            else {
                col = parts_.getColumn(cn);
                if (col != 0) {
                    ipToPos[j] = ncols - iscol.size();
                    iscol.push_back(col);
                    if (col->type() == ibis::CATEGORY) {
                        const ibis::category *cat =
                            static_cast<const ibis::category*>(col);
                        cats[j] = cat->getDictionary();
                    }
                    else if (col->type() == ibis::UINT) {
                        const ibis::bord::column *bc =
                            dynamic_cast<const ibis::bord::column*>(col);
                        if (bc != 0) {
                            cats[j] = bc->getDictionary();
                        }
                    }
                    LOGGER(ibis::gVerbose > 1)
                        << evt << " encountered a column name (" << colnames[j]
                        << ") that does not start with a data partition name, "
                        "assume it is for \"" << parts_.name() << "\"";
                }
                else {
                    LOGGER(ibis::gVerbose > 0)
                        << "Warning -- " << evt << " encountered a name ("
                        << colnames[j] << ") that does not start with a data "
                        "partition name";
                    return res;
                }
            }
        }
    } // for (uint32_t j = 0; j < ncols;

    LOGGER(ibis::gVerbose > 3)
        << evt << " -- found " << ircol.size()
        << " column" << (ircol.size() > 1 ? "s" : "") << " from "
        << partr_.name() << " and " << iscol.size() << " column"
        << (iscol.size() > 1 ? "s" : "") << " from " << parts_.name();

    // change Pos values for columns in S to have offset ircol.size()
    for (uint32_t j = 0; j < ncols; ++j) {
        if (ipToPos[j] <= ncols && ipToPos[j] >= ircol.size())
            ipToPos[j] = (ncols - ipToPos[j]) + ircol.size();
    }
    // the block guards release the buffers when this function returns
    ibis::table::typeArray   rtypes(ircol.size(), ibis::UNKNOWN_TYPE);
    ibis::table::bufferArray rbuff(ircol.size(), 0);
    IBIS_BLOCK_GUARD(ibis::table::freeBuffers, ibis::util::ref(rbuff),
                     ibis::util::ref(rtypes));
    ibis::table::typeArray   stypes(iscol.size(), ibis::UNKNOWN_TYPE);
    ibis::table::bufferArray sbuff(iscol.size(), 0);
    IBIS_BLOCK_GUARD(ibis::table::freeBuffers, ibis::util::ref(sbuff),
                     ibis::util::ref(stypes));
    bool sane = true;

    // retrieve values from r_
    for (uint32_t j = 0; sane && j < ircol.size(); ++ j) {
        rtypes[j] = ircol[j]->type();
        switch (ircol[j]->type()) {
        case ibis::BYTE:
            rbuff[j] = ircol[j]->selectBytes(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<signed char>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::UBYTE:
            rbuff[j] = ircol[j]->selectUBytes(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<unsigned char>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::SHORT:
            rbuff[j] = ircol[j]->selectShorts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int16_t>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::USHORT:
            rbuff[j] = ircol[j]->selectUShorts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint16_t>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::INT:
            rbuff[j] = ircol[j]->selectInts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int32_t>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::UINT:
            rbuff[j] = ircol[j]->selectUInts(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint32_t>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::LONG:
            rbuff[j] = ircol[j]->selectLongs(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int64_t>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::ULONG:
            rbuff[j] = ircol[j]->selectULongs(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint64_t>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::FLOAT:
            rbuff[j] = ircol[j]->selectFloats(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<float>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::DOUBLE:
            rbuff[j] = ircol[j]->selectDoubles(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<double>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        case ibis::TEXT:
        case ibis::CATEGORY:
            rbuff[j] = ircol[j]->selectStrings(maskr_);
            if (rbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<std::vector<std::string>*>(rbuff[j]),
                     *orderr_);
            else
                sane = false;
            break;
        default:
            sane = false;
            rbuff[j] = 0;
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- jRange::select does not support column "
                "type " << ibis::TYPESTRING[(int)ircol[j]->type()]
                << " (name = " << partr_.name() << "." << ircol[j]->name()
                << ")";
            break;
        }
    }
    if (! sane) {
        return res;
    }

    // retrieve values from parts_
    for (uint32_t j = 0; sane && j < iscol.size(); ++ j) {
        stypes[j] = iscol[j]->type();
        switch (iscol[j]->type()) {
        case ibis::BYTE:
            sbuff[j] = iscol[j]->selectBytes(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<signed char>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::UBYTE:
            sbuff[j] = iscol[j]->selectUBytes(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<unsigned char>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::SHORT:
            sbuff[j] = iscol[j]->selectShorts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int16_t>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::USHORT:
            sbuff[j] = iscol[j]->selectUShorts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint16_t>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::INT:
            sbuff[j] = iscol[j]->selectInts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int32_t>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::UINT:
            sbuff[j] = iscol[j]->selectUInts(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint32_t>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::LONG:
            sbuff[j] = iscol[j]->selectLongs(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<int64_t>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::ULONG:
            sbuff[j] = iscol[j]->selectULongs(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<uint64_t>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::FLOAT:
            sbuff[j] = iscol[j]->selectFloats(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<float>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::DOUBLE:
            sbuff[j] = iscol[j]->selectDoubles(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<array_t<double>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        case ibis::TEXT:
        case ibis::CATEGORY:
            sbuff[j] = iscol[j]->selectStrings(masks_);
            if (sbuff[j] != 0)
                ibis::util::reorder
                    (*static_cast<std::vector<std::string>*>(sbuff[j]),
                     *orders_);
            else
                sane = false;
            break;
        default:
            sane = false;
            sbuff[j] = 0;
            LOGGER(ibis::gVerbose > 1)
                << "Warning -- jRange::select does not support column "
                "type " << ibis::TYPESTRING[(int)iscol[j]->type()]
                << " (name = " << parts_.name() << "." << iscol[j]->name()
                << ")";
            break;
        }
    }
    if (! sane) {
        return res;
    }

    /// fill the in-memory buffer
    switch (colr_.type()) {
    case ibis::BYTE:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<signed char>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<signed char>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::UBYTE:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<unsigned char>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<unsigned char>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::SHORT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<int16_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<int16_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::USHORT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<uint16_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<uint16_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::INT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<int32_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<int32_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::UINT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<uint32_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<uint32_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::LONG:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<int64_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<int64_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::ULONG:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<uint64_t>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<uint64_t>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::FLOAT:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<float>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<float>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    case ibis::DOUBLE:
        res = fillResult
            (nrows, delta1_, delta2_, evt,
             *static_cast<array_t<double>*>(valr_), rtypes, rbuff,
             *static_cast<array_t<double>*>(vals_), stypes, sbuff,
             colnames, ipToPos);
        break;
    default:
        LOGGER(ibis::gVerbose > 0)
            << "Warning -- " << evt << " can not handle join column of type "
            << ibis::TYPESTRING[(int)colr_.type()];
    }

    // Attach the dictionaries to the output columns.  res is still null
    // when the join column type is not supported or fillResult failed, so
    // it must be checked before it is dereferenced.
    if (res != 0) {
        for (unsigned j = 0; j < cats.size(); ++ j) {
            if (cats[j] != 0) {
                ibis::bord::column *bc = dynamic_cast<ibis::bord::column*>
                    (static_cast<ibis::bord*>(res)->getColumn(j));
                if (bc != 0)
                    bc->setDictionary(cats[j]);
            }
        }
    }
    return res;
} // ibis::jRange::select
Пример #21
0
/**
 * Conjugate-gradient iteration for the system described by operator A and
 * right-hand side b, starting from (and updating in place) the vector x.
 *
 * NOTE(review): the template header that declares OperatorType, VectorType
 * and Matvec is not visible in this excerpt; it is expected to precede this
 * definition.
 *
 * @param[in,out] A           operator; A.has_local_indices must be true,
 *                            otherwise an error is printed and the function
 *                            returns without touching any output.
 * @param[in]     b           right-hand-side vector.
 * @param[in,out] x           initial guess on entry, updated every iteration.
 * @param[in]     matvec      callable invoked as matvec(A, p, Ap).
 * @param[in]     max_iter    upper bound on the number of iterations.
 * @param[in]     tolerance   iteration stops once normr <= tolerance
 *                            (only read here, despite being a reference).
 * @param[out]    num_iters   index of the last completed iteration; 0 when
 *                            no iteration was completed.
 * @param[out]    normr       sqrt(dot(r, r)) as evaluated at the top of the
 *                            last iteration (or the initial residual norm if
 *                            the loop never ran).
 * @param[out]    my_cg_times array indexed by WAXPY, DOT, MATVEC, MATVECDOT
 *                            and TOTAL receiving the accumulated timings.
 */
void
cg_solve(OperatorType& A,
         const VectorType& b,
         VectorType& x,
         Matvec matvec,
         typename OperatorType::LocalOrdinalType max_iter,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& tolerance,
         typename OperatorType::LocalOrdinalType& num_iters,
         typename TypeTraits<typename OperatorType::ScalarType>::magnitude_type& normr,
         timer_type* my_cg_times)
{
  typedef typename OperatorType::ScalarType ScalarType;
  typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType;
  typedef typename OperatorType::LocalOrdinalType LocalOrdinalType;
  typedef typename TypeTraits<ScalarType>::magnitude_type magnitude_type;

  // t0 is presumably the scratch start time used by the TICK()/TOCK() macros
  // (defined elsewhere) -- do not remove it even though it looks unused here.
  timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0;
  timer_type total_time = mytimer();

  int myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (!A.has_local_indices) {
    std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means "
       << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve."
       << std::endl;
    return;
  }

  size_t nrows = A.rows.size();
  LocalOrdinalType ncols = A.num_cols;

  // Temporary vectors: residual r, search direction p and the product Ap.
  nvtxRangeId_t r1=nvtxRangeStartA("Allocation of Temporary Vectors");
  VectorType r(b.startIndex, nrows);
  VectorType p(0, ncols);
  VectorType Ap(b.startIndex, nrows);
  nvtxRangeEnd(r1);

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  // Register (page-lock) the host buffers with CUDA for the duration of the
  // solve; the matching cudaHostUnregister calls follow the iteration loop.
  //TODO move outside?
  cudaHostRegister(&p.coefs[0],ncols*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostRegister(&A.send_buffer[0],A.send_buffer.size()*sizeof(typename VectorType::ScalarType),0);
  cudaCheckError();
#endif
#endif

  // Give the outputs defined values even if the loop below never executes
  // (initial residual already within tolerance, or max_iter < 1).
  num_iters = 0;
  normr = 0;
  magnitude_type rtrans = 0;
  magnitude_type oldrtrans = 0;

  // Print roughly ten progress lines, at least every 50 iterations,
  // and never with a frequency below 1 (which would divide by zero in k%print_freq).
  LocalOrdinalType print_freq = max_iter/10;
  if (print_freq>50) print_freq = 50;
  if (print_freq<1)  print_freq = 1;

  ScalarType one = 1.0;
  ScalarType zero = 0.0;

  // Presumably waxpby(a, u, b, v, w) computes w = a*u + b*v, so this copies
  // x into p -- TODO confirm against the waxpby definition.
  TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY);

  TICK();
  matvec(A, p, Ap);
  TOCK(tMATVEC);

  // Initial residual (presumably r = b - A*x, given the assumption above).
  TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY);

  TICK(); rtrans = dot(r, r); TOCK(tDOT);

  normr = std::sqrt(rtrans);

  if (myproc == 0) {
    std::cout << "Initial Residual = "<< normr << std::endl;
  }

  magnitude_type brkdown_tol = std::numeric_limits<magnitude_type>::epsilon();

#ifdef MINIFE_DEBUG
  std::ostream& os = outstream();
  os << "brkdown_tol = " << brkdown_tol << std::endl;
#endif

  for(LocalOrdinalType k=1; k <= max_iter && normr > tolerance; ++k) {
    if (k == 1) {
      // First iteration: the search direction starts out as the residual.
      TICK(); waxpby(one, r, zero, r, p); TOCK(tWAXPY);
    }
    else {
      oldrtrans = rtrans;
      TICK(); rtrans = dot(r, r); TOCK(tDOT);
      magnitude_type beta = rtrans/oldrtrans;
      TICK(); waxpby(one, r, beta, p, p); TOCK(tWAXPY);
    }

    // Note: this is the norm of the residual *before* this iteration's update.
    normr = std::sqrt(rtrans);

    if (myproc == 0 && (k%print_freq==0 || k==max_iter)) {
      std::cout << "Iteration = "<<k<<"   Residual = "<<normr<<std::endl;
    }

    magnitude_type alpha = 0;
    magnitude_type p_ap_dot = 0;

    TICK(); matvec(A, p, Ap); TOCK(tMATVEC);

    TICK(); p_ap_dot = dot(Ap, p); TOCK(tDOT);

#ifdef MINIFE_DEBUG
    os << "iter " << k << ", p_ap_dot = " << p_ap_dot;
    os.flush();
#endif
    // The breakdown check is currently disabled by the literal `false`.
    //TODO remove false below
    if (false && p_ap_dot < brkdown_tol) {
      if (p_ap_dot < 0 || breakdown(p_ap_dot, Ap, p)) {
        std::cerr << "miniFE::cg_solve ERROR, numerical breakdown!"<<std::endl;
#ifdef MINIFE_DEBUG
        os << "ERROR, numerical breakdown!"<<std::endl;
#endif
        // Leave the loop instead of returning so that the shared epilogue
        // below unregisters the host buffers and records every timer
        // (including MATVECDOT) on this path as well.
        break;
      }
      else brkdown_tol = 0.1 * p_ap_dot;
    }
    alpha = rtrans/p_ap_dot;
#ifdef MINIFE_DEBUG
    os << ", rtrans = " << rtrans << ", alpha = " << alpha << std::endl;
#endif

    // Update the solution and the residual; both are timed as one WAXPY slot.
    TICK(); waxpby(one, x, alpha, p, x);
            waxpby(one, r, -alpha, Ap, r); TOCK(tWAXPY);
    num_iters = k;
  }

#ifdef HAVE_MPI
#ifndef GPUDIRECT
  // Undo the registrations made before the iteration loop.
  //TODO move outside?
  cudaHostUnregister(&p.coefs[0]);
  cudaCheckError();
  if(A.send_buffer.size()>0) cudaHostUnregister(&A.send_buffer[0]);
  cudaCheckError();
#endif
#endif

  // Hand the accumulated timings back to the caller.
  my_cg_times[WAXPY] = tWAXPY;
  my_cg_times[DOT] = tDOT;
  my_cg_times[MATVEC] = tMATVEC;
  my_cg_times[MATVECDOT] = tMATVECDOT;
  my_cg_times[TOTAL] = mytimer() - total_time;
}