/*
 * Sequential blocked sparse LU factorization.
 *
 * BENCH is treated as a bots_arg_size x bots_arg_size grid of block pointers,
 * addressed as BENCH[i * bots_arg_size + j]. An entry equal to 0 is skipped
 * (or, for the trailing update, allocated on demand).
 *
 * NOTE(review): the brace-only lines of this listing are missing (no function
 * braces, no closing braces for the blocks opened below), so it does not
 * compile as shown. Restore them from the upstream source before use.
 */
void sparselu_seq_call(float **BENCH)
  int ii;
  int jj;
  int kk;
  /* One outer step per diagonal block kk. */
  for (kk = 0; kk < bots_arg_size; kk++) {
    /* Process the diagonal block; fwd and bdiv below take it as first argument. */
    lu0((BENCH[(kk * bots_arg_size) + kk]));
    /* fwd on every present block BENCH[kk][jj] with jj > kk. */
    for (jj = (kk + 1); jj < bots_arg_size; jj++) 
      if ((BENCH[(kk * bots_arg_size) + jj]) != ((0L))) {
        fwd((BENCH[(kk * bots_arg_size) + kk]),(BENCH[(kk * bots_arg_size) + jj]));
    /* bdiv on every present block BENCH[ii][kk] with ii > kk. */
    for (ii = (kk + 1); ii < bots_arg_size; ii++) 
      if ((BENCH[(ii * bots_arg_size) + kk]) != ((0L))) {
        bdiv((BENCH[(kk * bots_arg_size) + kk]),(BENCH[(ii * bots_arg_size) + kk]));
    /* Trailing update: for each pair (ii, jj) > kk whose blocks [ii][kk] and
       [kk][jj] are both present, call bmod on block [ii][jj]. */
    for (ii = (kk + 1); ii < bots_arg_size; ii++) 
      if ((BENCH[(ii * bots_arg_size) + kk]) != ((0L))) 
        for (jj = (kk + 1); jj < bots_arg_size; jj++) 
          if ((BENCH[(kk * bots_arg_size) + jj]) != ((0L))) {
            /* The target block may not exist yet: allocate it before bmod. */
            if ((BENCH[(ii * bots_arg_size) + jj]) == ((0L))) 
              BENCH[(ii * bots_arg_size) + jj] = allocate_clean_block();
            bmod((BENCH[(ii * bots_arg_size) + kk]),(BENCH[(kk * bots_arg_size) + jj]),(BENCH[(ii * bots_arg_size) + jj]));
Exemple #2
/*
 * Sequential blocked sparse LU factorization over a bots_arg_size x
 * bots_arg_size grid of block pointers, addressed as
 * BENCH[i * bots_arg_size + j]. NULL entries are skipped (or allocated on
 * demand for the trailing update).
 *
 * NOTE(review): brace-only lines are missing from this listing, and no lu0()
 * call on the diagonal block is visible here although fwd/bdiv take the
 * diagonal block as input. Presumably that line was dropped together with
 * the braces; confirm against the upstream source.
 */
void sparselu_seq_call(float **BENCH)
   int ii, jj, kk;

   /* One outer step per diagonal block kk. */
   for (kk=0; kk<bots_arg_size; kk++)
      /* fwd on every present block [kk][jj] with jj > kk. */
      for (jj=kk+1; jj<bots_arg_size; jj++)
         if (BENCH[kk*bots_arg_size+jj] != NULL)
            fwd(BENCH[kk*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj]);
      /* bdiv on every present block [ii][kk] with ii > kk. */
      for (ii=kk+1; ii<bots_arg_size; ii++) 
         if (BENCH[ii*bots_arg_size+kk] != NULL)
            bdiv (BENCH[kk*bots_arg_size+kk], BENCH[ii*bots_arg_size+kk]);
      /* Trailing update: bmod on block [ii][jj] whenever [ii][kk] and [kk][jj]
         are both present. */
      for (ii=kk+1; ii<bots_arg_size; ii++)
         if (BENCH[ii*bots_arg_size+kk] != NULL)
            for (jj=kk+1; jj<bots_arg_size; jj++)
               if (BENCH[kk*bots_arg_size+jj] != NULL)
                     /* Allocate the target block on first use. */
                     if (BENCH[ii*bots_arg_size+jj]==NULL) BENCH[ii*bots_arg_size+jj] = allocate_clean_block();
                     bmod(BENCH[ii*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj], BENCH[ii*bots_arg_size+jj]);

Exemple #3
/*
 * Blocked LU factorization of the NB x NB grid of block pointers M, with each
 * block operation annotated as an `omp task` carrying depend clauses, followed
 * by a taskwait and a printout of the elapsed time.
 *
 * NOTE(review): brace-only lines (and apparently some statement lines) are
 * missing from this listing. In particular:
 *  - t_start and t_end are never assigned in the visible lines, so the printed
 *    time is computed from unset values as shown;
 *  - the statement the first task pragma applies to is not visible;
 *  - the statement guarded by `if (M[ii][jj]==NULL)` is not visible.
 * NOTE(review): `[BSIZE][BSIZE]ptr` is not standard OpenMP array-section
 * syntax; it looks like an OmpSs-style shaping expression. Confirm the
 * target compiler.
 */
void lu_dependencies( double* M[NB][NB] )
    float t_start,t_end;
    float time;

    int ii, jj, kk;
    /* One outer step per diagonal block kk. */
    for (kk=0; kk<NB; kk++) {
            double *diag = M[kk][kk];
            /* Task on the diagonal block (its body is not visible here). */
#pragma omp task depend(inout: [BSIZE][BSIZE]diag)
        /* fwd on each present block M[kk][jj], jj > kk: reads diag, updates col. */
        for (jj=kk+1; jj<NB; jj++)
            if (M[kk][jj] != NULL) {
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
#pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]col)
                fwd(diag, col);
        for (ii=kk+1; ii<NB; ii++) {
            if (M[ii][kk] != NULL) {
                    /* NOTE(review): here `row` is the diagonal block M[kk][kk] and
                       `diag` is M[ii][kk]; the depend clause marks M[ii][kk] as `in`
                       and the diagonal as `inout`, and bdiv receives (M[ii][kk],
                       M[kk][kk]). The sparselu listings in this file call
                       bdiv(diagonal, M[ii][kk]) instead. Confirm the intended
                       argument order and dependence direction against bdiv. */
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
#pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]row)
                    bdiv (diag, row);

                /* Trailing update of M[ii][jj] from M[ii][kk] and M[kk][jj]. */
                for (jj=kk+1; jj<NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        /* Presumably allocates the missing target block; the
                           guarded statement is not visible. TODO confirm. */
                        if (M[ii][jj]==NULL)
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
#pragma omp task depend(in: [BSIZE][BSIZE]row, [BSIZE][BSIZE]col) depend(inout: [BSIZE][BSIZE]inner)
                            bmod(row, col, inner);

    /* Wait for the child tasks created above before reading the timer. */
#pragma omp taskwait

    time = t_end-t_start;
    printf("Dependencies time to compute = %f usec\n", time);
/*
 * Outlined task body taking a type-erased pointer to an OUT__1__1527___data
 * record. It unpacks BENCH (by pointer) and the indices ii, jj, kk, makes sure
 * block [ii][jj] exists, and then calls bmod on it.
 *
 * NOTE(review): the function's braces are missing from this listing.
 * NOTE(review): the name pattern suggests compiler-generated outlining of the
 * bmod task in a sparselu_par_call variant; this is not verifiable from here.
 */
static void OUT__1__1527__(void *__out_argv)
  /* Unpack the captured variables from the argument record. */
  float ***BENCH = (float ***)(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::BENCH_p);
  int ii = (int )(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::ii);
  int jj = (int )(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::jj);
  int kk = (int )(((struct OUT__1__1527___data *)__out_argv) -> OUT__1__1527___data::kk);
  /* Local copies of the indices; only these copies are used below. */
  int _p_ii = ii;
  int _p_jj = jj;
  int _p_kk = kk;
  /* Allocate the target block [ii][jj] if it does not exist yet. */
  if ((( *BENCH)[(_p_ii * bots_arg_size) + _p_jj]) == ((0L))) 
    ( *BENCH)[(_p_ii * bots_arg_size) + _p_jj] = allocate_clean_block();
  /* Update block [ii][jj] from blocks [ii][kk] and [kk][jj]. */
  bmod((( *BENCH)[(_p_ii * bots_arg_size) + _p_kk]),(( *BENCH)[(_p_kk * bots_arg_size) + _p_jj]),(( *BENCH)[(_p_ii * bots_arg_size) + _p_jj]));
// Overwrites the last entries a[N] and b[N] (N = a.size() - 1) of the
// coefficient vectors using the two prescribed values xl1 and xl2.
//
// a and b must be ROL::StdVector<Real> (enforced by Teuchos::dyn_cast); their
// underlying std::vector storage is modified in place.
//
// NOTE(review): this listing is incomplete. `Real` is not declared in the
// visible lines (presumably a template parameter), the closing braces of the
// loops and of the function are missing, and `lapack` is never used here.
// Nothing visible writes to `g`, so g1 and g2 are both 0 as shown and the
// final divisions by (g1 - g2) would divide by zero. Presumably two linear
// solves (one per shift) were dropped after each amod update; confirm against
// the upstream source.
void rec_lobatto( Teuchos::LAPACK<int,Real> &lapack,
                  const double xl1, 
                  const double xl2,
                  ROL::Vector<Real> &a,
                  ROL::Vector<Real> &b ) {

    // Get mutable access to the std::vector storage behind a and b.
    Teuchos::RCP<std::vector<Real> > ap = 
        Teuchos::rcp_const_cast<std::vector<Real> >((Teuchos::dyn_cast<ROL::StdVector<Real> >(a)).getVector()); 
    Teuchos::RCP<std::vector<Real> > bp = 
        Teuchos::rcp_const_cast<std::vector<Real> >((Teuchos::dyn_cast<ROL::StdVector<Real> >(b)).getVector()); 

    // Index of the last entry; also the length of the work vectors below.
    const int N = ap->size()-1;

    // Work vectors: shifted copy of a (length N), square roots of b (length N-1),
    // a unit vector (length N) and a result vector (length N), all zero-filled.
    Teuchos::RCP<std::vector<Real> > amodp = Teuchos::rcp(new std::vector<Real> (N,0.0));
    Teuchos::RCP<std::vector<Real> > bmodp = Teuchos::rcp(new std::vector<Real> (N-1,0.0));
    Teuchos::RCP<std::vector<Real> > enp   = Teuchos::rcp(new std::vector<Real> (N,0.0));
    Teuchos::RCP<std::vector<Real> > gp    = Teuchos::rcp(new std::vector<Real> (N,0.0));

    // Nth canonical vector
    (*enp)[N-1] = 1.0;

    // bmod[i] = sqrt(b[i+1]) for i = 0 .. N-2.
    for(int i=0;i<N-1;++i) {
        (*bmodp)[i] = sqrt((*bp)[i+1]);

    // First shift: amod = a - xl1 on the leading N entries.
    for(int i=0;i<N;++i) {
        (*amodp)[i] = (*ap)[i]-xl1;
    // Wrap the work vectors as ROL vectors (these share the storage above).
    ROL::StdVector<Real> amod(amodp);  
    ROL::StdVector<Real> bmod(bmodp);  
    ROL::StdVector<Real> en(enp);  
    ROL::StdVector<Real> g(gp);  

    // Last component of g for the first shift.
    Real g1 = (*gp)[N-1];

    // Second shift: amod = a - xl2 on the leading N entries.
    for(int i=0;i<N;++i) {
        (*amodp)[i] = (*ap)[i]-xl2;

    // Last component of g for the second shift.
    Real g2 = (*gp)[N-1];

    // Combine the two results into the new last entries of a and b.
    (*ap)[N] = (g1*xl2-g2*xl1)/(g1-g2);
    (*bp)[N] = (xl2-xl1)/(g1-g2);
Exemple #6
/*
 * Serial blocked LU factorization of the NB x NB grid of block pointers M,
 * followed by a printout of the elapsed time.
 *
 * NOTE(review): brace-only lines (and apparently some statement lines) are
 * missing from this listing. In particular:
 *  - t_end is never assigned in the visible lines, so the printed time is
 *    computed from an unset value as shown;
 *  - no operation on the diagonal block is visible after `diag` is read;
 *  - the statement guarded by `if (M[ii][jj]==NULL)` is not visible.
 */
void lu_serial( double* M[NB][NB] )
    float t_start,t_end;
    float time;
    /* Start of the timed region. */
    t_start= mysecond();

    int ii, jj, kk;
    /* One outer step per diagonal block kk. */
    for (kk=0; kk<NB; kk++) {
            double *diag = M[kk][kk];

        /* fwd on each present block M[kk][jj], jj > kk. */
        for (jj=kk+1; jj<NB; jj++)
            if (M[kk][jj] != NULL)
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
                fwd(diag, col);

        for (ii=kk+1; ii<NB; ii++) {
            if (M[ii][kk] != NULL) {
                    /* NOTE(review): `row` is the diagonal block and `diag` is
                       M[ii][kk]; bdiv therefore receives (M[ii][kk], M[kk][kk]),
                       the reverse of the sparselu listings in this file.
                       Confirm against bdiv's definition. */
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
                    bdiv (diag, row);

                /* Trailing update of M[ii][jj] from M[ii][kk] and M[kk][jj]. */
                for (jj=kk+1; jj<NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        /* Presumably allocates the missing target block; the
                           guarded statement is not visible. TODO confirm. */
                        if (M[ii][jj]==NULL)
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
                            bmod(row, col, inner);


    time = t_end-t_start;
    printf("Serial time to compute = %f usec\n", time);
Exemple #7
/*
 * OpenMP version of the blocked sparse LU factorization. Inside one parallel
 * region, the fwd, bdiv and bmod loops of each step kk are work-shared with
 * `omp for`, and each block operation is created as an untied task.
 *
 * BENCH is addressed as BENCH[i * bots_arg_size + j]; NULL entries are skipped
 * (or allocated on demand for the trailing update).
 *
 * NOTE(review): this listing is incomplete. Brace-only lines are missing, the
 * first bots_message() call is cut off after its format string, and the
 * statement under `#pragma omp single` is not visible (presumably the lu0 call
 * on the diagonal block). Task bodies therefore cannot be delimited reliably
 * from what is shown.
 */
void sparselu_par_call(float **BENCH)
   int ii, jj, kk;
   bots_message("Computing SparseLU Factorization (%dx%d matrix with %dx%d blocks) ",
#pragma omp parallel private(kk)
   /* Each thread runs the kk loop with its own private kk. */
   for (kk=0; kk<bots_arg_size; kk++) 
#pragma omp single

   /* fwd tasks for the present blocks [kk][jj], jj > kk; nowait skips the
      barrier at the end of this work-shared loop. */
#pragma omp for nowait
      for (jj=kk+1; jj<bots_arg_size; jj++)
         if (BENCH[kk*bots_arg_size+jj] != NULL)
            #pragma omp task untied firstprivate(kk, jj) shared(BENCH)
            fwd(BENCH[kk*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj]);
   /* bdiv tasks for the present blocks [ii][kk], ii > kk. */
#pragma omp for
      for (ii=kk+1; ii<bots_arg_size; ii++) 
         if (BENCH[ii*bots_arg_size+kk] != NULL)
            #pragma omp task untied firstprivate(kk, ii) shared(BENCH)
            bdiv (BENCH[kk*bots_arg_size+kk], BENCH[ii*bots_arg_size+kk]);

   /* Trailing update: one task per (ii, jj) pair whose [ii][kk] and [kk][jj]
      blocks are both present. */
#pragma omp for private(jj)
      for (ii=kk+1; ii<bots_arg_size; ii++)
         if (BENCH[ii*bots_arg_size+kk] != NULL)
            for (jj=kk+1; jj<bots_arg_size; jj++)
               if (BENCH[kk*bots_arg_size+jj] != NULL)
               #pragma omp task untied firstprivate(kk, jj, ii) shared(BENCH)
                     /* Allocate the target block on first use, then update it. */
                     if (BENCH[ii*bots_arg_size+jj]==NULL) BENCH[ii*bots_arg_size+jj] = allocate_clean_block();
                     bmod(BENCH[ii*bots_arg_size+kk], BENCH[kk*bots_arg_size+jj], BENCH[ii*bots_arg_size+jj]);

   bots_message(" completed!\n");
/*
 * OpenMP task-dependence version of the blocked sparse LU factorization.
 * Under `parallel` + `single`, the kk loop creates one task per block
 * operation (lu0, fwd, bdiv, bmod); ordering between tasks is expressed with
 * depend(in/inout) clauses, and a final taskwait follows.
 *
 * BENCH         grid of block pointers, addressed as BENCH[i*matrix_size + j]
 * matrix_size   number of blocks per grid dimension (loop bound and stride)
 * submatrix_size  passed through to lu0/fwd/bdiv/bmod/allocate_clean_block
 *
 * NOTE(review): brace-only lines are missing from this listing.
 * NOTE(review): the depend clauses use array sections of BENCH itself, i.e.
 * ranges of `float *` entries of length submatrix_size*submatrix_size starting
 * at the block's index, not the block's data. Confirm this is the intended
 * dependence object.
 */
void sparselu_par_call(float **BENCH, int matrix_size, int submatrix_size)
    int ii, jj, kk;

#pragma omp parallel private(kk,ii,jj) shared(BENCH)
#pragma omp single /* nowait */
        /*#pragma omp task untied*/
        /* One outer step per diagonal block kk. */
        for (kk=0; kk<matrix_size; kk++)
            /* Task on the diagonal block. */
#pragma omp task firstprivate(kk) shared(BENCH) depend(inout: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size])
            lu0(BENCH[kk*matrix_size+kk], submatrix_size);
            /* fwd tasks for present blocks [kk][jj], jj > kk. */
            for (jj=kk+1; jj<matrix_size; jj++)
                if (BENCH[kk*matrix_size+jj] != NULL)
#pragma omp task firstprivate(kk, jj) shared(BENCH) depend(in: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size]) depend(inout: BENCH[kk*matrix_size+jj:submatrix_size*submatrix_size])
                    fwd(BENCH[kk*matrix_size+kk], BENCH[kk*matrix_size+jj], submatrix_size);
            /* bdiv tasks for present blocks [ii][kk], ii > kk. */
            for (ii=kk+1; ii<matrix_size; ii++)
                if (BENCH[ii*matrix_size+kk] != NULL)
#pragma omp task firstprivate(kk, ii) shared(BENCH) depend(in: BENCH[kk*matrix_size+kk:submatrix_size*submatrix_size]) depend(inout: BENCH[ii*matrix_size+kk:submatrix_size*submatrix_size])
                    bdiv (BENCH[kk*matrix_size+kk], BENCH[ii*matrix_size+kk], submatrix_size);

            /* Trailing update: bmod task on [ii][jj] whenever [ii][kk] and
               [kk][jj] are both present. */
            for (ii=kk+1; ii<matrix_size; ii++)
                if (BENCH[ii*matrix_size+kk] != NULL)
                    for (jj=kk+1; jj<matrix_size; jj++)
                        if (BENCH[kk*matrix_size+jj] != NULL)
                            /* The allocation is written before the task pragma,
                               i.e. it is not part of the bmod task itself. */
                            if (BENCH[ii*matrix_size+jj]==NULL) BENCH[ii*matrix_size+jj] = allocate_clean_block(submatrix_size);
#pragma omp task firstprivate(kk, jj, ii) shared(BENCH) \
                            depend(in: BENCH[ii*matrix_size+kk:submatrix_size*submatrix_size], BENCH[kk*matrix_size+jj:submatrix_size*submatrix_size]) \
                            depend(inout: BENCH[ii*matrix_size+jj:submatrix_size*submatrix_size])
                            bmod(BENCH[ii*matrix_size+kk], BENCH[kk*matrix_size+jj], BENCH[ii*matrix_size+jj], submatrix_size);

    /* Wait for the child tasks created above. */
#pragma omp taskwait
Exemple #9
/*
 * Blocked LU factorization step loop for one participant `me` of a
 * block-distributed matrix. Blocks are looked up in the global array `a`
 * (indexed a[row + col*nblocks]); block_owner(I, J) decides which participant
 * works on a block, and get_remote() fetches a copy of a block that is owned
 * elsewhere.
 *
 * n   matrix dimension (loop bound for k, i, j)
 * bs  block edge length (loop stride); the last block may be shorter
 * me  id compared against block_owner() results
 *
 * NOTE(review): brace-only lines are missing from this listing, so the block
 * structure (e.g. the ends of the else branches) must be restored from the
 * upstream source.
 * NOTE(review): buf1/buf2 are malloc'd without a visible NULL check or free();
 * dimI/dimJ/dimK and t1..t22 are not used in the visible lines.
 */
void lu(int n, int bs, int me)
    int i, il, j, jl, k, kl;
    int I, J, K;
    double *A, *B, *C, *D;
    int dimI, dimJ, dimK;
    int strI, strJ, strK;
    unsigned int t1, t2, t3, t4, t11, t22;
    int diagowner;
    double *buf1, *buf2;

    /* temporary memories */
    buf1 = (double *)malloc(block_size*block_size*sizeof(double));
    buf2 = (double *)malloc(block_size*block_size*sizeof(double));

    /* k is the element offset of step K's block; K is the block index. */
    for (k=0, K=0; k<n; k+=bs, K++) {
        /* kl = end offset of block K, clamped to n; strK = its actual size. */
        kl = k + bs;
        if (kl > n) {
            kl = n;
            strK = kl - k;
        } else {
            strK = bs;

        /* factor diagonal block */
        diagowner = block_owner(K, K);
        if (diagowner == me) {
            A = a[K+K*nblocks];
            lu0(A, strK, strK);

        /* divide column k by diagonal block */
        /* D = diagonal block: local pointer if owned, else a fetched copy. */
        if(block_owner(K, K) == me)
            D = a[K+K*nblocks];
        else {
            D = buf1;
            get_remote(D, K, K);
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            if (block_owner(I, K) == me) {  /* parcel out blocks */
                il = i + bs;
                if (il > n) {
                    il = n;
                    strI = il - i;
                } else {
                    strI = bs;
                A = a[I+K*nblocks];
                bdiv(A, D, strI, strK, strI, strK);

        /* modify row k by diagonal block */
        for (j=kl, J=K+1; j<n; j+=bs, J++) {
            if (block_owner(K, J) == me) {  /* parcel out blocks */
                jl = j+bs;
                if (jl > n) {
                    jl = n;
                    strJ = jl - j;
                } else {
                    strJ = bs;
                A = a[K+J*nblocks];
                bmodd(D, A, strK, strJ, strK, strK);


        /* modify subsequent block columns */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            il = i+bs;
            if (il > n) {
                il = n;
                strI = il - i;
            } else {
                strI = bs;

            /* A = block (I, K): local pointer if owned, else fetched into buf1
               (buf1 is reused here after serving as D above). */
            if(block_owner(I,K) == me)
                A = a[I+K*nblocks];
            else {
                A = buf1;
                get_remote(A, I, K);
            for (j=kl, J=K+1; j<n; j+=bs, J++) {
                jl = j + bs;
                if (jl > n) {
                    jl = n;
                    strJ= jl - j;
                } else {
                    strJ = bs;
                if (block_owner(I, J) == me) {  /* parcel out blocks */
                    /* B = block (K, J): local pointer if owned, else fetched. */
                    if(block_owner(K,J) == me)
                        B = a[K+J*nblocks];
                    else {
                        B = buf2;
                        get_remote(B, K, J);
                    /* Update the owned block C = (I, J) from A and B. */
                    C = a[I+J*nblocks];
                    bmod(A, B, C, strI, strJ, strK, strI, strK, strI);


Exemple #10
/*
 * Worker-side routine of a row-block distributed LU factorization (the
 * existing comments describe receiving rows from process 0, factoring, and
 * sending the result back).
 *
 * NOTE(review): this listing is a skeleton. Most statement lines and all
 * brace-only lines are missing, so loops and conditionals below have no
 * visible bodies, and ntasks, pid, rownum, myrow and nextrow are never
 * assigned in the visible lines. b, workbuf, status and k's loop body are
 * likewise not shown in use. Restore from the upstream source before
 * drawing conclusions about behaviour.
 */
void slave() {

  double *b;
  double *buffer;
  double *workbuf;
  int i,j,k;
  int myrow,nextrow,rownum;
  MPI_Status status;
  int ntasks,pid;

  /* Get the number of processes in the application.
   * Can we define ntasks as a shared variable? */
  /* allocate the local portion of matrix */

  /* Allocate buffer space; it should be big enough
   * to contain a whole row of blocks. */
  buffer=(double *)malloc(matrix_size*block_size*sizeof(double));

  /* receive the initial matrix from process 0 */
  for (i=0;i<rownum;i++)


  /* do computation work of this process */
  for (i=0;i<matrix_size;i+=block_size) {

    /* Compute the id of the process that owns rows i
     * to i+block_size-1. */

    if (pid==myrank) { /* My process */
      /* factor diagonal */

      /* modify "column" by diagonal */
      for (j=myrow+block_size;j<matrix_size;j+=block_size)

      /* Send this row to the other processes; do we only need to send the
       * columns after the diagonal? */
      for (j=0;j<ntasks;j++) {
	if (j!=myrank)
    else { /* other process */
      /* receive row i to row i+block_size-1 from process pid */
    if (myrank>pid)

    /* modify the "row" using diagonal */
    for (j=nextrow;j<matrix_size;j+=block_size) 
    /* modify the internal rows and columns */
    for (j=nextrow;j<matrix_size;j+=block_size)
      for (k=i+block_size;k<matrix_size;k+=block_size) 


  /* Send b to process 0. */
  for (i=0;i<rownum;i++)
Exemple #11
/*
 * Process-0 routine of a row-block distributed LU factorization: allocates
 * and initializes the full matrix `a` and vector `rhs`, and (per the existing
 * comments) distributes row blocks, takes part in the factorization, prints
 * the elapsed time and collects the results.
 *
 * NOTE(review): this listing is a skeleton. Most statement lines and all
 * brace-only lines are missing, so loops and conditionals below have no
 * visible bodies, and ntasks, pid and ct are never assigned in the visible
 * lines (ct is printed as shown). The malloc results are not checked or freed
 * in the visible lines. Restore from the upstream source before drawing
 * conclusions about behaviour.
 */
void master() {
  double *a;
  double *rhs;
  int ntasks,pid;
  double *buffer;
  double *workbuf;
  int i,j,k;
  MPI_Status status;
  time_t t0,t1;
  int  ct;
  /* get the number of the processes in application. */

  /* allocate matrix, rhs vector */
  a = (double *) malloc( matrix_size*matrix_size*sizeof(double) ) ;
  rhs = (double *) malloc( matrix_size*sizeof(double) ) ;

  /* initialize the matrix */
  /* Do we need to allocate the whole matrix a, or only one row at a time,
   * initializing each row and then sending it to the corresponding process? */
  initializeMatrix( matrix_size, a, rhs );

  /* Send each row to the corresponding process. */
  /* Starts at i = block_size: the first row block is not part of this loop. */
  for (i=block_size;i<matrix_size;i+=block_size) {

    /* Send rows i to i+block_size-1 to process
     * (i mod block_size)%ntasks. */

    if (pid!=0)


  /* Allocate buffer space; it should be big enough
   * to contain a whole row of blocks. */
  buffer=(double *)malloc(matrix_size*block_size*sizeof(double));

  /* Do the computation work of process 0 */
  for (i=0;i<matrix_size;i+=block_size) {

    /* Compute the id of the process that owns rows i
     * to i+block_size-1. */

    if (pid==0) { /* master process. Me! */
      /* factor diagonal */

      /* modify "column" by diagonal */
      for (j=i+block_size;j<matrix_size;j+=block_size)

      /* Send this row to the other processes; do we only need to send
       * the columns after the diagonal? */
      for (j=1;j<ntasks;j++) {
    else { /* other process */
      /* receive row i to row i+block_size-1 from process pid */

    /* modify the "row" using diagonal */
    /* Stride block_size*ntasks: visits every ntasks-th row block from the start offset. */
    for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks) 

    /* modify the internal rows and columns */
    for (j=i+(ntasks-pid)*block_size;j<matrix_size;j+=block_size*ntasks)
      for (k=i+block_size;k<matrix_size;k+=block_size) 



  printf("LU decomposition took %d millisecs\n", ct);

  /* Receive the modified matrix from all other processes. */
  for (i=0;i<matrix_size;i+=block_size) {

    /* Compute the id of the process that owns rows i
     * to i+block_size-1. */

    if (pid!=0)
  /* test the resulting decomposition */
Exemple #12
/*
 * Blocked LU factorization step loop for one process `me`, ARMCI variant.
 * Blocks are looked up in the global array `a` (indexed a[row + col*nblocks]);
 * block_owner(I, J) decides which process works on a block. After a column or
 * row block is updated, it is pushed with ARMCI_NbPut into the bufc/bufr slots
 * of the other owners, and the trailing update reads remote operands from
 * those slots.
 *
 * n   matrix dimension (loop bound for k, i, j)
 * bs  block edge length (loop stride); the last block may be shorter
 * me  id compared against block_owner() results
 *
 * NOTE(review): brace-only lines are missing from this listing, so the block
 * structure must be restored from the upstream source.
 * NOTE(review): ARMCI_NbPut is called with a NULL handle and no wait or fence
 * call is visible before the trailing update reads bufc/bufr. `handle`, `hc`,
 * dimI/dimJ/dimK and t1..t22 are not used in the visible lines. Confirm how
 * completion of the puts is ensured.
 * NOTE(review): dbuf is allocated without a visible check or release.
 */
void lu(int n, int bs, int me)
  int i, il, j, jl, k, kl;
  int I, J, K;
  double *A, *B, *C, *D;
  int dimI, dimJ, dimK;
  int strI, strJ, strK;
  unsigned int t1, t2, t3, t4, t11, t22;
  int diagowner, destp, hc, m;
  double *dbuf;
  armci_hdl_t handle[2*MAXPROC];
  /* saved[p] != 0 means the current block has already been put to process p. */
  int saved[MAXPROC];  
  /* Local buffer for a copy of a remote diagonal block. */
  dbuf = (double *)ARMCI_Malloc_local((armci_size_t) block_size*block_size*sizeof(double));

  /* k is the element offset of step K's block; K is the block index. */
  for (k=0, K=0; k<n; k+=bs, K++) {
    /* kl = end offset of block K, clamped to n; strK = its actual size. */
    kl = k + bs; 
    if (kl > n) {
      kl = n;
      strK = kl - k;
    } else {
      strK = bs;
    /* factor diagonal block */
    diagowner = block_owner(K, K);
    if (diagowner == me) {
      A = a[K+K*nblocks]; 
      lu0(A, strK, strK); /* impl algo on this diag block */
    /* divide column k by diagonal block */
    /* D = diagonal block: local pointer if owned, else a fetched copy. */
    if(block_owner(K, K) == me)
      D = a[K+K*nblocks];
    else {
      D = dbuf;
      get_remote(D, K, K);
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      if (block_owner(I, K) == me) {  /* parcel out blocks */
	il = i + bs; 
	if (il > n) {
	  il = n;
	  strI = il - i;
	} else {
	  strI = bs;
	A = a[I+K*nblocks]; 
	bdiv(A, D, strI, strK, strI, strK);
	/* Pre-put this block to the block-owners of all blocks on the I-th row with a non-blocking put*/
	/* Reset the per-destination flags so each process receives the block once. */
	memset (saved, 0, sizeof(saved));
	for (m = K+1; m < nblocks; m++) {
	    destp = block_owner (I, m);
	    if (destp != me && !saved[destp]) {
	      ARMCI_NbPut(A, bufc[destp*nblocks + I], strI*strK*sizeof(double), destp, NULL);
	      saved[destp] = 1;
    } /* end of for (i=k1, I=K+1...) */
    /* modify row k by diagonal block */
    for (j=kl, J=K+1; j<n; j+=bs, J++) {
      if (block_owner(K, J) == me) {  /* parcel out blocks */
	jl = j+bs; 
	if (jl > n) {
	  jl = n;
	  strJ = jl - j;
	} else {
	  strJ = bs;
	A = a[K+J*nblocks];
	bmodd(D, A, strK, strJ, strK, strK);
	/* Pre-put this block to the block-owners of all blocks on the J-th column with a non-blocking put*/
        memset (saved, 0, sizeof(saved));
        for (m = K+1; m < nblocks; m++) {
	  destp = block_owner (m, J);
	  if (destp != me  && !saved[destp]) {
	    ARMCI_NbPut(A, bufr[destp*nblocks + J], strK*strJ*sizeof(double), destp, NULL);
	    saved[destp] = 1;
    /* modify subsequent block columns */
    for (i=kl, I=K+1; i<n; i+=bs, I++) {
      il = i+bs; 
      if (il > n) {
	il = n;
	strI = il - i;
      } else {
	strI = bs;

      for (j=kl, J=K+1; j<n; j+=bs, J++) {
	jl = j + bs; 
	if (jl > n) {
	  jl = n;
	  strJ= jl - j;
	} else {
	  strJ = bs;
	if (block_owner(I, J) == me) {  /* parcel out blocks */
	  /* A = block (I, K): local pointer if owned, else this process's bufc slot. */
	  if(block_owner(I,K) == me)
	    A = a[I+K*nblocks];
	  else {
	    A = bufc[me*nblocks+I];
	  /* B = block (K, J): local pointer if owned, else this process's bufr slot.
	     NOTE(review): as shown, the bufr assignment below is unconditional
	     and overwrites the local pointer; an `else` line appears to be
	     missing from this listing. Confirm against the upstream source. */
	  if(block_owner(K,J) == me)
	    B = a[K+J*nblocks];
	    B = bufr[me*nblocks + J];
	  /* Update the owned block C = (I, J) from A and B. */
	  C = a[I+J*nblocks];
	  bmod(A, B, C, strI, strJ, strK, strI, strK, strI);
Exemple #13
/*
 * Thread entry-point variant of the blocked LU step loop. lu_arg is read as an
 * int array holding { n, bs, th_idx }; ownership tests compare block_owner()
 * against me_th[th_idx]. Blocks are looked up in the global array `a`
 * (indexed a[row + col*nblocks]); get_remote() fetches a copy of a block owned
 * elsewhere. Returns lu_arg unchanged.
 *
 * NOTE(review): brace-only lines are missing from this listing, and the
 * `#ifdef DEBUG` below has no visible matching `#endif` (presumably dropped
 * with them). Restore both from the upstream source.
 * NOTE(review): buf1/buf2 are malloc'd without a visible NULL check or free();
 * dimI/dimJ/dimK and t1..t22 are not used in the visible lines.
 */
void *lu(void *lu_arg)
    int n, bs, th_idx;
    int i, il, j, jl, k, kl;
    int I, J, K;
    double *A, *B, *C, *D;
    int dimI, dimJ, dimK;
    int strI, strJ, strK;
    unsigned int t1, t2, t3, t4, t11, t22;
    int diagowner;
    double *buf1, *buf2;

    /* Unpack the three int arguments: matrix size, block size, thread index. */
    n = ((int *)lu_arg)[0];
    bs = ((int *)lu_arg)[1];
    th_idx = ((int *)lu_arg)[2];

#ifdef DEBUG
    printf("DBG: starting thread %d(idx=%d) on node %d\n", me_th[th_idx], th_idx, me); fflush(stdout);

    /* temporary memories */
    buf1 = (double *)malloc(block_size*block_size*sizeof(double));
    buf2 = (double *)malloc(block_size*block_size*sizeof(double));

    /* k is the element offset of step K's block; K is the block index. */
    for (k=0, K=0; k<n; k+=bs, K++) {
        /* kl = end offset of block K, clamped to n; strK = its actual size. */
        kl = k + bs; 
        if (kl > n) {
            kl = n;
            strK = kl - k;
        } else {
            strK = bs;

        /* factor diagonal block */
        diagowner = block_owner(K, K);
        if (diagowner == me_th[th_idx]) {
            A = a[K+K*nblocks];
            print_block_dbg(A, "th=%d, idx=%d: before lu0 a[%d]:\n", me_th[th_idx], th_idx, K+K*nblocks);
            lu0(A, strK, strK);

        /* divide column k by diagonal block */
        /* D = diagonal block: local pointer if owned, else a fetched copy. */
        if(block_owner(K, K) == me_th[th_idx])
            D = a[K+K*nblocks];
        else {
            D = buf1;
            get_remote(D, K, K);
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            if (block_owner(I, K) == me_th[th_idx]) {  /* parcel out blocks */
                il = i + bs; 
                if (il > n) {
                    il = n;
                    strI = il - i;
                } else {
                    strI = bs;
                A = a[I+K*nblocks]; 
                bdiv(A, D, strI, strK, strI, strK);

        /* modify row k by diagonal block */
        for (j=kl, J=K+1; j<n; j+=bs, J++) {
            if (block_owner(K, J) == me_th[th_idx]) {  /* parcel out blocks */
                jl = j+bs; 
                if (jl > n) {
                    jl = n;
                    strJ = jl - j;
                } else {
                    strJ = bs;
                A = a[K+J*nblocks];
                bmodd(D, A, strK, strJ, strK, strK);

        /* modify subsequent block columns */
        for (i=kl, I=K+1; i<n; i+=bs, I++) {
            il = i+bs; 
            if (il > n) {
                il = n;
                strI = il - i;
            } else {
                strI = bs;

            /* A = block (I, K): local pointer if owned, else fetched into buf1
               (buf1 is reused here after serving as D above). */
            if(block_owner(I,K) == me_th[th_idx])
                A = a[I+K*nblocks];
            else {
                A = buf1;
                get_remote(A, I, K);
            for (j=kl, J=K+1; j<n; j+=bs, J++) {
                jl = j + bs; 
                if (jl > n) {
                    jl = n;
                    strJ= jl - j;
                } else {
                    strJ = bs;
                if (block_owner(I, J) == me_th[th_idx]) {  /* parcel out blocks */
                    /* B = block (K, J): local pointer if owned, else fetched. */
                    if(block_owner(K,J) == me_th[th_idx])
                        B = a[K+J*nblocks];
                    else {
                        B = buf2;
                        get_remote(B, K, J);
                    /* Update the owned block C = (I, J) from A and B. */
                    C = a[I+J*nblocks];
                    bmod(A, B, C, strI, strJ, strK, strI, strK, strI);


    /* Hand the argument pointer back to the caller unchanged. */
    return lu_arg;