/*
 * Sequential driver for the blocked sparse LU kernels.
 *
 * BENCH is read as a bots_arg_size x bots_arg_size row-major table of block
 * pointers.  A null entry is skipped by fwd/bdiv; in the bmod phase a null
 * target entry is first filled with allocate_clean_block().
 */
void sparselu_seq_call(float **BENCH)
{
  int step;
  int row;
  int col;

  for (step = 0; step < bots_arg_size; step++) {
    /* Diagonal block of this elimination step. */
    float *pivot = BENCH[step * bots_arg_size + step];

    lu0(pivot);

    /* Blocks to the right of the pivot, in the pivot's block row. */
    for (col = step + 1; col < bots_arg_size; col++) {
      float *upper = BENCH[step * bots_arg_size + col];
      if (upper)
        fwd(pivot, upper);
    }

    /* Blocks below the pivot, in the pivot's block column. */
    for (row = step + 1; row < bots_arg_size; row++) {
      float *lower = BENCH[row * bots_arg_size + step];
      if (lower)
        bdiv(pivot, lower);
    }

    /* Trailing blocks: only pairs where both the column-side and row-side
       blocks exist get a bmod call. */
    for (row = step + 1; row < bots_arg_size; row++) {
      float *lower = BENCH[row * bots_arg_size + step];
      if (!lower)
        continue;
      for (col = step + 1; col < bots_arg_size; col++) {
        float *upper = BENCH[step * bots_arg_size + col];
        float **target = &BENCH[row * bots_arg_size + col];
        if (!upper)
          continue;
        /* Fill-in: create the target block on first touch. */
        if (!*target)
          *target = allocate_clean_block();
        bmod(lower, upper, *target);
      }
    }
  }
}
Exemple #2
0
/*
 * Sequential blocked sparse LU driver.
 *
 * BENCH is treated as a bots_arg_size x bots_arg_size row-major table of
 * block pointers; NULL entries mark blocks that are not present.  For each
 * step kk the diagonal block goes through lu0, existing blocks in row kk
 * through fwd, existing blocks in column kk through bdiv, and every trailing
 * (ii, jj) pair whose row-kk and column-kk blocks both exist through bmod
 * (allocating the target block first when it is NULL).
 */
void sparselu_seq_call(float **BENCH)
{
   int ii, jj, kk;

   for (kk = 0; kk < bots_arg_size; kk++)
   {
      /* Start of block row kk inside the flat pointer table. */
      float **pivot_row = BENCH + kk * bots_arg_size;
      float  *diag      = pivot_row[kk];

      lu0(diag);

      for (jj = kk + 1; jj < bots_arg_size; jj++)
      {
         if (pivot_row[jj] == NULL) continue;
         fwd(diag, pivot_row[jj]);
      }

      for (ii = kk + 1; ii < bots_arg_size; ii++)
      {
         float *below = BENCH[ii * bots_arg_size + kk];
         if (below == NULL) continue;
         bdiv(diag, below);
      }

      for (ii = kk + 1; ii < bots_arg_size; ii++)
      {
         float **cur_row = BENCH + ii * bots_arg_size;
         if (cur_row[kk] == NULL) continue;

         for (jj = kk + 1; jj < bots_arg_size; jj++)
         {
            if (pivot_row[jj] == NULL) continue;
            /* Fill-in block: allocate it the first time it is updated. */
            if (cur_row[jj] == NULL) cur_row[jj] = allocate_clean_block();
            bmod(cur_row[kk], pivot_row[jj], cur_row[jj]);
         }
      }
   }
}
Exemple #3
0
/*
 * Blocked sparse LU over the NB x NB table of block pointers M, spawning one
 * task per kernel call (lu0, fwd, bdiv, bmod) and letting the depend clauses
 * order them.  NULL entries of M are skipped; a NULL bmod target is created
 * with allocate_clean_block() by the spawning thread before the task is
 * issued.  After a taskwait, the elapsed time between the two mysecond()
 * samples is printed.
 *
 * The `[BSIZE][BSIZE]ptr` form inside depend(...) is kept exactly as written;
 * it is not standard OpenMP array-section syntax (presumably an OmpSs /
 * Mercurium shaping expression - TODO confirm the target compiler).
 */
void lu_dependencies( double* M[NB][NB] )
{
    /* Timestamps are kept in double: a float has too few mantissa bits to
       hold a wall-clock reading and still resolve short intervals, so the
       difference below would be dominated by rounding.
       (assumes mysecond() returns a floating-point seconds value - TODO confirm) */
    double t_start, t_end;
    double elapsed;
    int ii, jj, kk;

    t_start=mysecond();

    for (kk=0; kk<NB; kk++) {
        {
            double *diag = M[kk][kk];
#pragma omp task depend(inout: [BSIZE][BSIZE]diag)
            lu0(diag);
        }
        for (jj=kk+1; jj<NB; jj++)
            if (M[kk][jj] != NULL) {
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
#pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]col)
                fwd(diag, col);
            }
            
        for (ii=kk+1; ii<NB; ii++) {
            if (M[ii][kk] != NULL) {
                {
                    /* NOTE(review): `row` points at the pivot block M[kk][kk]
                       and `diag` at M[ii][kk], i.e. the names look swapped
                       relative to what they reference, and the clause below
                       marks M[kk][kk] as inout.  Confirm against bdiv's
                       parameter order before relying on these dependences. */
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
#pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]row)
                    bdiv (diag, row);
                }

                for (jj=kk+1; jj<NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        /* Fill-in block is allocated here, outside the task,
                           so the pointer captured below is already valid. */
                        if (M[ii][jj]==NULL)
                            M[ii][jj]=allocate_clean_block();
                        {
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
#pragma omp task depend(in: [BSIZE][BSIZE]row, [BSIZE][BSIZE]col) depend(inout: [BSIZE][BSIZE]inner)
                            bmod(row, col, inner);
                        }    
                    }
                }
            }
        }
    }

    /* Wait for every spawned task before taking the end timestamp. */
#pragma omp taskwait

    t_end=mysecond();
    elapsed = t_end-t_start;
    /* NOTE(review): the label says "usec"; the unit actually depends on what
       mysecond() returns - confirm. */
    printf("Dependencies time to compute = %f usec\n", elapsed);
}
/*
 * Outlined task body: unpacks (BENCH_p, ii, jj, kk) from the argument
 * record, allocates the (ii, jj) block if its slot is null, then calls bmod
 * on the (ii, kk), (kk, jj) and (ii, jj) blocks of the table that BENCH_p
 * points to.
 */
static void OUT__1__1527__(void *__out_argv)
{
  struct OUT__1__1527___data *packed = (struct OUT__1__1527___data *)__out_argv;
  float ***bench_ref = (float ***)(packed -> BENCH_p);
  const int row = (int )(packed -> ii);
  const int col = (int )(packed -> jj);
  const int piv = (int )(packed -> kk);
  /* Slot of the block being updated; written below when it is still null. */
  float **target = &( *bench_ref)[(row * bots_arg_size) + col];

  if (!( *target))
     *target = allocate_clean_block();
  bmod(( *bench_ref)[(row * bots_arg_size) + piv],( *bench_ref)[(piv * bots_arg_size) + col], *target);
}
/**
 * Overwrites the last entry (index N = size-1) of the coefficient arrays
 * behind `a` and `b`.
 *
 * Two solves are performed through trisolve(), one with the first N entries
 * of `a` shifted by xl1 and one shifted by xl2; both use sqrt(b[1..N-1]) as
 * the off-diagonal and the N-th canonical vector as right-hand side.  The
 * last component of each solution (g1, g2) determines the new a[N] and b[N].
 * Presumably this is the Gauss-Lobatto modification of a three-term
 * recurrence (inferred from the name - confirm against the caller).
 *
 * @param lapack  LAPACK wrapper forwarded to trisolve().
 * @param xl1     first shift value.
 * @param xl2     second shift value.
 * @param a       must be a ROL::StdVector<Real>; entry N is overwritten.
 * @param b       must be a ROL::StdVector<Real>; entry N is overwritten.
 */
void rec_lobatto( Teuchos::LAPACK<int,Real> &lapack,
                  const double xl1, 
                  const double xl2,
                  ROL::Vector<Real> &a,
                  ROL::Vector<Real> &b ) {

    typedef std::vector<Real>    Coeffs;
    typedef Teuchos::RCP<Coeffs> CoeffsPtr;

    // Reach through the ROL wrappers to the underlying std::vector storage.
    CoeffsPtr alpha =
        Teuchos::rcp_const_cast<Coeffs>((Teuchos::dyn_cast<ROL::StdVector<Real> >(a)).getVector());
    CoeffsPtr beta =
        Teuchos::rcp_const_cast<Coeffs>((Teuchos::dyn_cast<ROL::StdVector<Real> >(b)).getVector());

    const int N = alpha->size()-1;

    CoeffsPtr shiftedDiag = Teuchos::rcp(new Coeffs(N,0.0));
    CoeffsPtr offDiag     = Teuchos::rcp(new Coeffs(N-1,0.0));
    CoeffsPtr unit        = Teuchos::rcp(new Coeffs(N,0.0));
    CoeffsPtr solution    = Teuchos::rcp(new Coeffs(N,0.0));

    // Right-hand side: N-th canonical vector.
    (*unit)[N-1] = 1.0;

    // Off-diagonal entries are the square roots of b[1..N-1].
    for(int i=0;i<N-1;++i) {
        (*offDiag)[i] = sqrt((*beta)[i+1]);
    }

    ROL::StdVector<Real> diagVec(shiftedDiag);
    ROL::StdVector<Real> offDiagVec(offDiag);
    ROL::StdVector<Real> unitVec(unit);
    ROL::StdVector<Real> solutionVec(solution);

    // One solve per shift; keep the last component of each solution.
    const double shift[2] = { xl1, xl2 };
    Real lastEntry[2];
    for(int s=0;s<2;++s) {
        for(int i=0;i<N;++i) {
            (*shiftedDiag)[i] = (*alpha)[i]-shift[s];
        }
        trisolve(lapack,offDiagVec,diagVec,offDiagVec,unitVec,solutionVec);
        lastEntry[s] = (*solution)[N-1];
    }

    const Real g1 = lastEntry[0];
    const Real g2 = lastEntry[1];

    (*alpha)[N] = (g1*xl2-g2*xl1)/(g1-g2);
    (*beta)[N]  = (xl2-xl1)/(g1-g2);

}
Exemple #6
0
/*
 * Serial blocked sparse LU over the NB x NB table of block pointers M.
 *
 * For each step kk: lu0 on the diagonal block, fwd on every non-NULL block
 * of row kk to its right, and for every non-NULL block M[ii][kk] below it a
 * bdiv call followed by bmod on each (ii, jj) whose M[kk][jj] is non-NULL
 * (allocating M[ii][jj] first when it is NULL).  The elapsed time between
 * two mysecond() samples is printed at the end.
 */
void lu_serial( double* M[NB][NB] )
{
    /* Timestamps are kept in double: a float has too few mantissa bits to
       hold a wall-clock reading and still resolve short intervals.
       (assumes mysecond() returns a floating-point seconds value - TODO confirm) */
    double t_start, t_end;
    double elapsed;
    int ii, jj, kk;

    t_start= mysecond();

    for (kk=0; kk<NB; kk++) {
        {
            double *diag = M[kk][kk];
            lu0(diag);
        }

        for (jj=kk+1; jj<NB; jj++)
            if (M[kk][jj] != NULL)
            {
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
                fwd(diag, col);
            }

        for (ii=kk+1; ii<NB; ii++) {
            if (M[ii][kk] != NULL) {
                {
                    /* NOTE(review): `row` points at the pivot block M[kk][kk]
                       and `diag` at M[ii][kk]; the names look swapped
                       relative to what they reference.  Confirm against
                       bdiv's parameter order. */
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
                    bdiv (diag, row);
                }

                for (jj=kk+1; jj<NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        /* Fill-in: create the target block on first use. */
                        if (M[ii][jj]==NULL)
                            M[ii][jj]=allocate_clean_block();
                        {
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
                            bmod(row, col, inner);
                        }
                    }
                }
            }
        }
    }

    t_end=mysecond();

    elapsed = t_end-t_start;
    /* NOTE(review): the label says "usec"; the unit actually depends on what
       mysecond() returns - confirm. */
    printf("Serial time to compute = %f usec\n", elapsed);
}
Exemple #7
0
/*
 * Parallel blocked sparse LU driver using worksharing loops that spawn tasks.
 *
 * BENCH is read as a bots_arg_size x bots_arg_size row-major table of block
 * pointers; NULL entries are skipped.  All threads of the parallel region
 * run the kk loop in lock step: each step's phases are separated by the
 * implicit barriers of the `single` and `for` constructs, and the tasks
 * created inside a loop are completed at the next barrier the team reaches.
 */
void sparselu_par_call(float **BENCH)
{
   int ii, jj, kk;
   
   bots_message("Computing SparseLU Factorization (%dx%d matrix with %dx%d blocks) ",
           bots_arg_size,bots_arg_size,bots_arg_size_1,bots_arg_size_1);
   /* kk is private so every thread keeps its own step counter; ii and jj
      become private as the iteration variables of the `omp for` loops
      (jj is listed explicitly for the nested loop of the last phase). */
#pragma omp parallel private(kk)
   {
   for (kk=0; kk<bots_arg_size; kk++) 
   {
      /* One thread factors the diagonal block; the implicit barrier at the
         end of `single` holds the others until it is done. */
#pragma omp single
      lu0(BENCH[kk*bots_arg_size+kk]);

      /* fwd tasks for row kk.  `nowait`: threads go straight on to the bdiv
         loop, whose operands are in column kk instead. */
#pragma omp for nowait
      for (jj=kk+1; jj<bots_arg_size; jj++)
         if (BENCH[kk*bots_arg_size+jj] != NULL)
            #pragma omp task untied firstprivate(kk, jj) shared(BENCH)
         {
            fwd(BENCH[kk*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj]);
         }
      /* bdiv tasks for column kk.  No `nowait` here: the implicit barrier
         ends both the fwd and the bdiv phase before bmod starts. */
#pragma omp for
      for (ii=kk+1; ii<bots_arg_size; ii++) 
         if (BENCH[ii*bots_arg_size+kk] != NULL)
            #pragma omp task untied firstprivate(kk, ii) shared(BENCH)
         {
            bdiv (BENCH[kk*bots_arg_size+kk], BENCH[ii*bots_arg_size+kk]);
         }

      /* bmod tasks for the trailing blocks.  A NULL target block is
         allocated inside the task that updates it.  The loop's implicit
         barrier finishes the step before the next lu0. */
#pragma omp for private(jj)
      for (ii=kk+1; ii<bots_arg_size; ii++)
         if (BENCH[ii*bots_arg_size+kk] != NULL)
            for (jj=kk+1; jj<bots_arg_size; jj++)
               if (BENCH[kk*bots_arg_size+jj] != NULL)
               #pragma omp task untied firstprivate(kk, jj, ii) shared(BENCH)
               {
                     if (BENCH[ii*bots_arg_size+jj]==NULL) BENCH[ii*bots_arg_size+jj] = allocate_clean_block();
                     bmod(BENCH[ii*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj], BENCH[ii*bots_arg_size+jj]);
               }

   }
   }
   bots_message(" completed!\n");
}
/*
 * Parallel blocked sparse LU driver using task dependences.
 *
 * BENCH is read as a matrix_size x matrix_size row-major table of block
 * pointers (NULL = block absent); submatrix_size is forwarded to every
 * kernel and to allocate_clean_block().  A single thread walks the whole
 * loop nest and creates one task per kernel call; ordering between tasks
 * comes only from the depend clauses.
 *
 * NOTE(review): the depend list items are array sections of BENCH itself
 * (the pointer table), starting at the block's slot and spanning
 * submatrix_size*submatrix_size elements - not sections of the block data.
 * Presumably the intent is one dependence object per block, keyed by its
 * slot; confirm this matches what the OpenMP runtime in use expects.
 */
void sparselu_par_call(float **BENCH, int matrix_size, int submatrix_size)
{
    int ii, jj, kk;

    /* The `single` construct is the body of the parallel region: one thread
       generates tasks, the rest of the team executes them. */
#pragma omp parallel private(kk,ii,jj) shared(BENCH)
#pragma omp single /* nowait */
    {
        /*#pragma omp task untied*/
        for (kk=0; kk<matrix_size; kk++)
        {
            /* Factor the diagonal block. */
#pragma omp task firstprivate(kk) shared(BENCH) depend(inout: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size])
            lu0(BENCH[kk*matrix_size+kk], submatrix_size);
            /* Row kk: each existing block reads the diagonal block. */
            for (jj=kk+1; jj<matrix_size; jj++)
                if (BENCH[kk*matrix_size+jj] != NULL)
                {
#pragma omp task firstprivate(kk, jj) shared(BENCH) depend(in: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size]) depend(inout: BENCH[kk*matrix_size+jj:submatrix_size*submatrix_size])
                    fwd(BENCH[kk*matrix_size+kk], BENCH[kk*matrix_size+jj], submatrix_size);
                }
            /* Column kk: each existing block reads the diagonal block. */
            for (ii=kk+1; ii<matrix_size; ii++)
                if (BENCH[ii*matrix_size+kk] != NULL)
                {
#pragma omp task firstprivate(kk, ii) shared(BENCH) depend(in: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size]) depend(inout: BENCH[ii*matrix_size+kk:submatrix_size*submatrix_size])
                    bdiv (BENCH[kk*matrix_size+kk], BENCH[ii*matrix_size+kk], submatrix_size);
                }

            /* Trailing blocks.  The NULL tests and the fill-in allocation
               run in the generating thread at task-creation time, so the
               target slot is already non-NULL when the bmod task is
               created. */
            for (ii=kk+1; ii<matrix_size; ii++)
                if (BENCH[ii*matrix_size+kk] != NULL)
                    for (jj=kk+1; jj<matrix_size; jj++)
                        if (BENCH[kk*matrix_size+jj] != NULL)
                        {
                            if (BENCH[ii*matrix_size+jj]==NULL) BENCH[ii*matrix_size+jj] = allocate_clean_block(submatrix_size);
#pragma omp task firstprivate(kk, jj, ii) shared(BENCH) \
                            depend(in: BENCH[ii*matrix_size+kk:submatrix_size*submatrix_size], BENCH[kk*matrix_size+jj:submatrix_size*submatrix_size]) \
                            depend(inout: BENCH[ii*matrix_size+jj:submatrix_size*submatrix_size])
                            bmod(BENCH[ii*matrix_size+kk], BENCH[kk*matrix_size+jj], BENCH[ii*matrix_size+jj], submatrix_size);
                        }

        }
        /* Wait for the tasks created above before leaving the single block. */
#pragma omp taskwait
    }
}
Exemple #9
0
/*
 * Distributed blocked LU of an n x n matrix split into bs x bs blocks.
 *
 * `me` is the caller's process id as returned by block_owner().  Blocks
 * owned locally are taken from the file-level table `a` (indexed
 * row + col*nblocks); blocks owned elsewhere are fetched with get_remote()
 * into scratch buffers.  Every step K has three phases, with an
 * MP_BARRIER() after the first and the second:
 *   1. the owner of block (K,K) factors it with lu0;
 *   2. owners of column-K blocks call bdiv, owners of row-K blocks bmodd,
 *      both against the diagonal block;
 *   3. owners of trailing blocks (I,J) call bmod with blocks (I,K), (K,J).
 * Edge blocks are narrower than bs when n is not a multiple of bs.
 */
void lu(int n, int bs, int me)
{
    int i, j, k, kl;
    int I, J, K;
    int strI, strJ, strK;
    double *A, *B, *C, *D;
    double *buf1, *buf2;

    /* Scratch space for one remote block each. */
    buf1 = (double *)malloc(block_size*block_size*sizeof(double));
    buf2 = (double *)malloc(block_size*block_size*sizeof(double));

    for (k=0, K=0; k<n; k+=bs, K++) {
        /* kl = first index past block K, clamped to n; strK = its extent. */
        kl = k + bs;
        if (kl > n)
            kl = n;
        strK = kl - k;

        /* Phase 1: factor the diagonal block on its owner. */
        if (block_owner(K, K) == me) {
            A = a[K+K*nblocks];
            lu0(A, strK, strK);
        }
        MP_BARRIER();

        /* Everyone needs the factored diagonal block: local or fetched. */
        if (block_owner(K, K) == me) {
            D = a[K+K*nblocks];
        } else {
            D = buf1;
            get_remote(D, K, K);
        }

        /* Phase 2a: divide the locally owned blocks of column K. */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            if (block_owner(I, K) != me)
                continue;
            strI = (i + bs > n) ? (n - i) : bs;
            A = a[I+K*nblocks];
            bdiv(A, D, strI, strK, strI, strK);
        }

        /* Phase 2b: modify the locally owned blocks of row K. */
        for (j=kl, J=K+1; j<n; j+=bs, J++) {
            if (block_owner(K, J) != me)
                continue;
            strJ = (j + bs > n) ? (n - j) : bs;
            A = a[K+J*nblocks];
            bmodd(D, A, strK, strJ, strK, strK);
        }

        MP_BARRIER();

        /* Phase 3: update the trailing blocks this process owns. */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            strI = (i + bs > n) ? (n - i) : bs;

            /* Column-K block of row I; buf1 is free again because D is not
               used past phase 2. */
            if (block_owner(I, K) == me) {
                A = a[I+K*nblocks];
            } else {
                A = buf1;
                get_remote(A, I, K);
            }

            for (j=kl, J=K+1; j<n; j+=bs, J++) {
                strJ = (j + bs > n) ? (n - j) : bs;

                if (block_owner(I, J) != me)
                    continue;

                if (block_owner(K, J) == me) {
                    B = a[K+J*nblocks];
                } else {
                    B = buf2;
                    get_remote(B, K, J);
                }
                C = a[I+J*nblocks];
                bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
            }
        }
    }

    free(buf1);
    free(buf2);
}
Exemple #10
0
/*
 * Worker-rank side of the block-row distributed LU factorization.
 *
 * Block rows (block_size matrix rows each) are dealt out cyclically: block
 * row r belongs to rank r % ntasks.  This rank receives its rownum block
 * rows from rank 0 into `b`, takes part in the elimination loop, and sends
 * `b` back to rank 0.  matrix_size, block_size and myrank are file-level
 * names.
 *
 * Fixes relative to the previous revision:
 *   - `b` is now allocated in bytes (the sizeof(double) factor was missing,
 *     so the buffer was too small for the MPI_DOUBLE receives);
 *   - MPI_Recv now receives into the row buffer itself, not into the
 *     address of the pointer variable;
 *   - both heap buffers are released before returning.
 */
void slave() {

  double *b;
  double *buffer;
  double *workbuf;
  int i,j,k;
  int myrow,nextrow,rownum;
  MPI_Status status;
  int ntasks,pid;

  /* get the number of processes in the application */
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks); 
  rownum=matrix_size/(block_size*ntasks);
  
  /* allocate the local portion of the matrix: rownum block rows of
     block_size x matrix_size doubles (sizeof first keeps the product in
     size_t arithmetic) */
  b=(double*)malloc(sizeof(double)*rownum*block_size*matrix_size);

  /* allocate buffer space, big enough to contain a whole block row */
  buffer=(double *)malloc(matrix_size*block_size*sizeof(double));

  /* receive the initial matrix from process 0; the tag is the global index
     of the first matrix row of the block row being received */
  for (i=0;i<rownum;i++)
    MPI_Recv(&b[i*block_size*matrix_size],block_size*matrix_size,MPI_DOUBLE,
             0,i*block_size*ntasks+myrank*block_size,MPI_COMM_WORLD,&status);

  MPI_Barrier(MPI_COMM_WORLD);

  /* do the computation work of this process */
  for (i=0;i<matrix_size;i+=block_size) {

    /* rank that owns rows i .. i+block_size-1, and the offset that block
       row has inside its owner's local array */
    pid=(i/block_size)%ntasks;
    myrow=((i/block_size)/ntasks)*block_size;

    if (pid==myrank) { /* My process */
     
      /* factor diagonal */
      /* NOTE(review): the column offset used here is the local row offset
         myrow, not the global column i - confirm this is intended. */
      lu0(&b[myrow+myrow*matrix_size],block_size,matrix_size);

      /* modify "column" by diagonal */
      for (j=myrow+block_size;j<matrix_size;j+=block_size)
        bdiv(&b[j+myrow*matrix_size],&b[myrow+myrow*matrix_size],
             block_size,matrix_size);

      /* send this block row to every other process */
      for (j=0;j<ntasks;j++) {
        if (j!=myrank)
          MPI_Send(&b[myrow*matrix_size],block_size*matrix_size,MPI_DOUBLE,
                   j,i,MPI_COMM_WORLD);
      }
      workbuf=&b[myrow*matrix_size];
    }
    else { /* other process */
      /* receive rows i .. i+block_size-1 from process pid into the row
         buffer (pass the buffer, not the address of the pointer) */
      MPI_Recv(buffer,block_size*matrix_size,MPI_DOUBLE,pid,i,
               MPI_COMM_WORLD,&status);
      workbuf=buffer;
    }
    
    /* first local block row that still has to be updated at this step */
    if (myrank>pid)
      nextrow=myrow;
    else
      nextrow=myrow+block_size;

    /* NOTE(review): the two loops below run j up to matrix_size while `b`
       holds only rownum*block_size rows - confirm the intended bound. */

    /* modify the "row" using diagonal */
    for (j=nextrow;j<matrix_size;j+=block_size) 
      bmodd(&b[i+j*matrix_size],&workbuf[i],
            block_size,matrix_size);
     
    /* modify the internal rows and columns */
    for (j=nextrow;j<matrix_size;j+=block_size)
      for (k=i+block_size;k<matrix_size;k+=block_size) 
        bmod(&b[k+j*matrix_size],&workbuf[k],&b[i+j*matrix_size],
             block_size,matrix_size);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* Send b to process 0. */
  for (i=0;i<rownum;i++)
    MPI_Send(&b[i*block_size*matrix_size],block_size*matrix_size,MPI_DOUBLE,
             0,i*block_size*ntasks+myrank*block_size,MPI_COMM_WORLD);

  /* workbuf only aliases b or buffer, so these two frees release everything */
  free(buffer);
  free(b);
}
Exemple #11
0
/*
 * Rank-0 side of the block-row distributed LU factorization.
 *
 * Builds the full matrix and right-hand side with initializeMatrix(), sends
 * each block row (block_size matrix rows) to its owner (block row r belongs
 * to rank r % ntasks; rank 0 keeps its own), takes part in the elimination
 * loop, collects the factored block rows back and hands the result to
 * checkResult().  matrix_size and block_size are file-level names.
 *
 * Fixes relative to the previous revision:
 *   - MPI_Recv now receives into the row buffer itself, not into the
 *     address of the pointer variable;
 *   - the elapsed time comes from time(), i.e. whole seconds, and is now
 *     reported as seconds instead of "millisecs";
 *   - the three heap buffers are released before returning.
 */
void master() {
  
  double *a;
  double *rhs;
  int ntasks,pid;
  double *buffer;
  double *workbuf;
  int i,j,k;
  MPI_Status status;
  time_t t0,t1;
  int  ct;
 
  /* get the number of processes in the application */
  MPI_Comm_size(MPI_COMM_WORLD,&ntasks); 

  /* allocate matrix, rhs vector */
  a = (double *) malloc( matrix_size*matrix_size*sizeof(double) ) ;
  rhs = (double *) malloc( matrix_size*sizeof(double) ) ;

  /* initialize the matrix */
  initializeMatrix( matrix_size, a, rhs );

  /* Send each block row to its owner; block row 0 always stays here, so
     the loop starts at the second block row.  The tag is the global index
     of the block row's first matrix row. */
  for (i=block_size;i<matrix_size;i+=block_size) {

    pid=(i/block_size)%ntasks;

    if (pid!=0)
      MPI_Send(&a[i*matrix_size],matrix_size*block_size,MPI_DOUBLE,
               pid,i,MPI_COMM_WORLD);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* allocate buffer space, big enough to contain a whole block row */
  buffer=(double *)malloc(matrix_size*block_size*sizeof(double));
  
  time(&t0);

  /* Do the computation work of process 0 */
  for (i=0;i<matrix_size;i+=block_size) {

    /* rank that owns rows i .. i+block_size-1 */
    pid=(i/block_size)%ntasks;

    if (pid==0) { /* master process. Me! */
     
      /* factor diagonal */
      lu0(&a[i+i*matrix_size],block_size,matrix_size);

      /* modify "column" by diagonal */
      for (j=i+block_size;j<matrix_size;j+=block_size)
        bdiv(&a[j+i*matrix_size],&a[i+i*matrix_size],
             block_size,matrix_size);

      /* send this block row to every other process */
      for (j=1;j<ntasks;j++) {
        MPI_Send(&a[i*matrix_size],block_size*matrix_size,MPI_DOUBLE,
                 j,i,MPI_COMM_WORLD);
      }
      workbuf=&a[i*matrix_size];
    }
    else { /* other process */
      /* receive rows i .. i+block_size-1 from process pid into the row
         buffer (pass the buffer, not the address of the pointer) */
      MPI_Recv(buffer,block_size*matrix_size,MPI_DOUBLE,pid,i,
               MPI_COMM_WORLD,&status);
      workbuf=buffer;
    }

    /* The next block row owned by rank 0 lies (ntasks-pid) block rows past
       i; from there every ntasks-th block row is ours. */

    /* modify the "row" using diagonal */
    for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks) 
      bmodd(&a[i+j*matrix_size],&workbuf[i],
            block_size,matrix_size);

    /* modify the internal rows and columns */
    for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks)
      for (k=i+block_size;k<matrix_size;k+=block_size) 
        bmod(&a[k+j*matrix_size],&workbuf[k],&a[i+j*matrix_size],
             block_size,matrix_size);

  }

  MPI_Barrier(MPI_COMM_WORLD);

  time(&t1);
  /* time() has one-second resolution; difftime yields the gap in seconds */
  ct=(int)difftime(t1,t0);
  printf("LU decomposition took %d seconds\n", ct);

  /* Receive the modified block rows from all other processes. */
  for (i=0;i<matrix_size;i+=block_size) {

    pid=(i/block_size)%ntasks;

    if (pid!=0)
      MPI_Recv(&a[i*matrix_size],block_size*matrix_size,MPI_DOUBLE,pid,i,MPI_COMM_WORLD,&status);
  }
  
  /* test the resulting decomposition */
  checkResult(matrix_size,a,rhs);

  /* workbuf only aliases a or buffer, so these frees release everything */
  free(buffer);
  free(rhs);
  free(a);
}
Exemple #12
0
/*
 * Distributed blocked LU of an n x n matrix split into bs x bs blocks,
 * using ARMCI non-blocking puts to push updated blocks to their consumers.
 *
 * `me` is the caller's process id as returned by block_owner().  Local
 * blocks come from the file-level table `a` (indexed row + col*nblocks).
 * Per step K:
 *   1. the owner of block (K,K) factors it with lu0; barrier;
 *   2. owners of column-K blocks call bdiv and owners of row-K blocks call
 *      bmodd, then put the result into bufc[dest*nblocks+I] /
 *      bufr[dest*nblocks+J] of every other process that owns a block in the
 *      same block row / block column (once per destination);
 *   3. after ARMCI_WaitAll, ARMCI_AllFence and a barrier, owners of trailing
 *      blocks (I,J) call bmod, reading remote operands from
 *      bufc[me*nblocks+I] and bufr[me*nblocks+J].
 *
 * dimI/dimJ/dimK, t1..t22, handle and hc are declared but not referenced in
 * this function.
 */
void lu(int n, int bs, int me)
{
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner, destp, hc, m;
  double *dbuf;
  armci_hdl_t handle[2*MAXPROC];
  /* saved[p] != 0 once the current block has been put to process p */
  int saved[MAXPROC];  
  
  /* scratch block for a remotely owned diagonal block */
  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

  for (k=0, K=0; k<n; k+=bs, K++) {
    /* kl = first index past block K (clamped to n); strK = block extent */
    kl = k + bs; 
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    }
    
    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks]; 
      lu0(A, strK, strK); /* impl algo on this diag block */
    }
    MP_BARRIER(); 
    
    /* divide column k by diagonal block */
    /* D = factored diagonal block: local copy or fetched into dbuf */
    if(block_owner(K, K) == me)
      D = a[K+K*nblocks];
    else {
      D = dbuf;
      get_remote(D, K, K);
    }
    
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) {  /* parcel out blocks */
	il = i + bs; 
	if (il > n) {
	  il = n;
	  strI = il - i;
	} else {
	  strI = bs;
	}
	A = a[I+K*nblocks]; 
	bdiv(A, D, strI, strK, strI, strK);
	
	/* Pre-put this block to the block-owners of all blocks on the I-th row with a non-blocking put*/
	/* NULL is passed as the handle argument; completion is awaited by
	   the ARMCI_WaitAll() call before phase 3 */
	memset (saved, 0, sizeof(saved));
	for (m = K+1; m < nblocks; m++) {
	    destp = block_owner (I, m);
	    if (destp != me && !saved[destp]) {
	      ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
	      saved[destp] = 1;
	    }
	}
      }
    } /* end of for (i=kl, I=K+1...) */
    
    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) {  /* parcel out blocks */
	jl = j+bs; 
	if (jl > n) {
	  jl = n;
	  strJ = jl - j;
	} else {
	  strJ = bs;
	}
	A = a[K+J*nblocks];
	bmodd(D, A, strK, strJ, strK, strK);
     
	/* Pre-put this block to the block-owners of all blocks on the J-th column with a non-blocking put*/
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
	  destp = block_owner (m, J);
	  if (destp != me  && !saved[destp]) {
	    ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
	    saved[destp] = 1;
	  }
	}
      }      
    }
        
    /* all puts issued above must have landed before anyone reads bufc/bufr */
    ARMCI_WaitAll();
    ARMCI_AllFence();
    MP_BARRIER();
    /* modify subsequent block columns */
    
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs; 
      if (il > n) {
	il = n;
	strI = il - i;
      } else {
	strI = bs;
      }

      for (j=kl, J=K+1; j<n; j+=bs, J++) {
	jl = j + bs; 
	if (jl > n) {
	  jl = n;
	  strJ= jl - j;
	} else {
	  strJ = bs;
	  }
	if (block_owner(I, J) == me) {  /* parcel out blocks */
	  /* column-K operand: local block, or the copy put into bufc */
	  if(block_owner(I,K) == me)
	    A = a[I+K*nblocks];
	  else {
	    A = bufc[me*nblocks+I];
          }
	  
	  /* row-K operand: local block, or the copy put into bufr */
	  if(block_owner(K,J) == me)
	    B = a[K+J*nblocks];
	  else
	    B = bufr[me*nblocks + J];
	    
	  C = a[I+J*nblocks];
	  bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
	}
      }
    }
  }
  ARMCI_Free_local(dbuf);
}
Exemple #13
0
/*
 * Thread entry point for the blocked LU factorization.
 *
 * lu_arg points at three ints: { n, bs, th_idx } - matrix order, block size
 * and this thread's slot in the file-level me_th[] table.  The id
 * me_th[th_idx] is what gets compared with block_owner().  Locally owned
 * blocks are taken from the file-level table `a` (row + col*nblocks);
 * others are fetched with get_remote() into scratch buffers.  Every step K
 * has three phases, with an MT_BARRIER() after the first and the second:
 *   1. the owner of block (K,K) factors it with lu0;
 *   2. owners of column-K blocks call bdiv, owners of row-K blocks bmodd;
 *   3. owners of trailing blocks (I,J) call bmod with blocks (I,K), (K,J).
 *
 * Returns lu_arg unchanged.
 */
void *lu(void *lu_arg)
{
    const int *args = (int *)lu_arg;
    const int n = args[0];
    const int bs = args[1];
    const int th_idx = args[2];
    int i, j, k, kl;
    int I, J, K;
    int strI, strJ, strK;
    double *A, *B, *C, *D;
    double *buf1, *buf2;

#ifdef DEBUG
    printf("DBG: starting thread %d(idx=%d) on node %d\n", me_th[th_idx], th_idx, me); fflush(stdout);
#endif

    /* Scratch space for one remote block each. */
    buf1 = (double *)malloc(block_size*block_size*sizeof(double));
    buf2 = (double *)malloc(block_size*block_size*sizeof(double));

    for (k=0, K=0; k<n; k+=bs, K++) {
        /* kl = first index past block K, clamped to n; strK = its extent. */
        kl = k + bs;
        if (kl > n)
            kl = n;
        strK = kl - k;

        /* Phase 1: factor the diagonal block on its owner. */
        if (block_owner(K, K) == me_th[th_idx]) {
            A = a[K+K*nblocks];
            print_block_dbg(A, "th=%d, idx=%d: before lu0 a[%d]:\n", me_th[th_idx], th_idx, K+K*nblocks);
            lu0(A, strK, strK);
        }
        MT_BARRIER();

        /* Everyone needs the factored diagonal block: local or fetched. */
        if (block_owner(K, K) == me_th[th_idx]) {
            D = a[K+K*nblocks];
        } else {
            D = buf1;
            get_remote(D, K, K);
        }

        /* Phase 2a: divide the blocks of column K owned by this thread. */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            if (block_owner(I, K) != me_th[th_idx])
                continue;
            strI = (i + bs > n) ? (n - i) : bs;
            A = a[I+K*nblocks];
            bdiv(A, D, strI, strK, strI, strK);
        }

        /* Phase 2b: modify the blocks of row K owned by this thread. */
        for (j=kl, J=K+1; j<n; j+=bs, J++) {
            if (block_owner(K, J) != me_th[th_idx])
                continue;
            strJ = (j + bs > n) ? (n - j) : bs;
            A = a[K+J*nblocks];
            bmodd(D, A, strK, strJ, strK, strK);
        }
        MT_BARRIER();

        /* Phase 3: update the trailing blocks this thread owns. */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            strI = (i + bs > n) ? (n - i) : bs;

            /* Column-K block of row I; buf1 is free again because D is not
               used past phase 2. */
            if (block_owner(I, K) == me_th[th_idx]) {
                A = a[I+K*nblocks];
            } else {
                A = buf1;
                get_remote(A, I, K);
            }

            for (j=kl, J=K+1; j<n; j+=bs, J++) {
                strJ = (j + bs > n) ? (n - j) : bs;

                if (block_owner(I, J) != me_th[th_idx])
                    continue;

                if (block_owner(K, J) == me_th[th_idx]) {
                    B = a[K+J*nblocks];
                } else {
                    B = buf2;
                    get_remote(B, K, J);
                }
                C = a[I+J*nblocks];
                bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
            }
        }
    }

    free(buf1);
    free(buf2);

    return lu_arg;
}