コード例 #1
0
ファイル: oostruct.c プロジェクト: barak/lush
void lush_delete(at *p)
{
   if (!p || ZOMBIEP(p))
      return;

   class_t *cl = classof(p);
   if (cl->dontdelete)
      error(NIL, "cannot delete this object", p);
   
   run_notifiers(p);
   
   if (cl->has_compiled_part) {
      assert(isa(p, object_class));
      /* OO objects may have two parts          */
      /* lush_delete has to delete both of them */
      object_t *obj = Mptr(p);
      struct CClass_object *cobj = obj->cptr;
      oostruct_dispose(obj);
      cobj->Vtbl->Cdestroy(cobj);
      
   } else {
      if (Class(p)->dispose)
         Mptr(p) = Class(p)->dispose(Mptr(p));
      else
         Mptr(p) = NULL;
   }
   zombify(p);
}
コード例 #2
0
ファイル: oostruct.c プロジェクト: barak/lush
static const char *generic_name(at *p)
{
   if (Class(p)->classname)
      sprintf(string_buffer, "::%s:%p", NAMEOF(Class(p)->classname),Mptr(p));
   else
      sprintf(string_buffer, "::%p:%p", Class(p), Mptr(p));
   
   return mm_strdup(string_buffer);
}
コード例 #3
0
ファイル: storage.c プロジェクト: barak/lush
static void storage_serialize(at **pp, int code)
{
   storage_t *st;
   int type, kind;
   size_t size;

   if (code != SRZ_READ) {
      st = Mptr(*pp);
      type = (int)st->type;
      kind = (int)st->kind;
      size = st->size;
   }
   // Read/write basic info
   serialize_int(&type, code);
   serialize_int(&kind, code);
   serialize_size(&size, code);

   // Create storage if needed
   if (code == SRZ_READ) {
      st = new_storage_managed((storage_type_t)type, size, NIL);
      *pp = st->backptr;
   }

   // Read/write storage data
   st = Mptr(*pp);
   if (type == ST_AT) {
      at **data = st->data;
      for (int i=0; i<size; i++)
         serialize_atstar( &data[i], code);

   } else  {
      FILE *f = serialization_file_descriptor(code);
      if (code == SRZ_WRITE) {
         extern int in_bwrite;
         in_bwrite += sizeof(int) + size * storage_sizeof[type];
         write4(f, STORAGE_NORMAL);
         storage_save(st, f);
         
      } else if (code == SRZ_READ) {
         int magic = read4(f);
         storage_load(st, f);
         if (magic == STORAGE_SWAPPED)
            swap_buffer(st->data, size, storage_sizeof[type]);
         else if (magic != STORAGE_NORMAL)
            RAISEF("Corrupted binary file",NIL);
      }
   }
}
コード例 #4
0
ファイル: oostruct.c プロジェクト: barak/lush
static void make_cref_slots(object_t *obj)
{
   assert(obj->cptr);
   dhclassdoc_t *cdoc = obj->cptr->Vtbl->Cdoc;
   const class_t *cl = Mptr(cdoc->lispdata.atclass);

   /* initialize compiled slots */
   int k = cl->num_cslots;
   int j = cl->num_cslots;
   while (cl) {
      /* do slots declare in this class */
      dhrecord *drec = cl->classdoc->argdata;
      j -= drec->ndim;
      assert(j>=0);
      drec = drec + 1;
      for (int i=j; i<k; i++) {
         assert(drec->op == DHT_NAME);
         void *p = (char *)(obj->cptr)+(ptrdiff_t)(drec->arg);
         obj->slots[i] = new_cref((drec+1)->op, p);
         //assign(cl->slots[i], cl->defaults[i]);
         drec = drec->end;
      }
      assert(drec->op == DHT_END_CLASS);
      cl = cl->super;
      k = j;
   }
   assert(j==0);
}
コード例 #5
0
ファイル: storage.c プロジェクト: barak/lush
void storage_clear(storage_t *st, at *init, size_t from)
{
   /* don't need to check read-only status here because 
      it will be checked by the setat function below */
   int size = st->size;
   if (from>=size)
      RAISEF("invalid value for 'from'", NEW_NUMBER(from));
   
   /* clear from from to to */
   if (st->type == ST_AT) {
      for (int off = from; off < size; off++)
         (storage_setat[st->type])(st, off, init);
      
   } else if (storage_setat[st->type] == Number_setat) {
      get_write_permit(st);
      void (*set)(gptr, size_t, real) = storage_setd[st->type];
      for (int off = from; off < size; off++)
         set(st->data, off, Number(init));

   } else if (storage_setat[st->type] == gptr_setat) {
      get_write_permit(st);
      gptr *pt = st->data;
      for (int off=from; off<size; off++)
         pt[off] = Gptr(init);

   } else if (storage_setat[st->type] == mptr_setat) {
      get_write_permit(st);
      gptr *pt = st->data;
      for (int off=from; off<size; off++)
         pt[off] = Mptr(init);
   } else
      RAISEF("don't know how to clear this storage", st->backptr);
}
コード例 #6
0
ファイル: storage.c プロジェクト: barak/lush
static void mptr_setat(storage_t *st, size_t off, at *x)
{
   get_write_permit(st);
   ifn (MPTRP(x))
      error(NIL, "not an mptr", x);
   gptr *pt = st->data;
   pt[off] = Mptr(x);
}
コード例 #7
0
/* interface calling into the fortran routine */
static int lbfgs(index_t *x0, at *f, at *g, double gtol, htable_t *p, at *vargs)
{
   /* argument checking and setup */

   extern void lbfgs_(int *n, int *m, double *x, double *fval, double *gval, \
                      int *diagco, double *diag, int iprint[2], double *gtol, \
                      double *xtol, double *w, int *iflag);

   ifn (IND_STTYPE(x0) == ST_DOUBLE)
      error(NIL, "not an array of doubles", x0->backptr);
   ifn (Class(f)->listeval)
      error(NIL, "not a function", f);
   ifn (Class(f)->listeval)
      error(NIL, "not a function", g);
   ifn (gtol > 0)
      error(NIL, "threshold value not positive", NEW_NUMBER(gtol));
   
   at *gx = copy_array(x0)->backptr;
   at *(*listeval_f)(at *, at *) = Class(f)->listeval;
   at *(*listeval_g)(at *, at *) = Class(g)->listeval;
   at *callf = new_cons(f, new_cons(x0->backptr, vargs));
   at *callg = new_cons(g, new_cons(gx, new_cons(x0->backptr, vargs)));

   htable_t *params = lbfgs_params();
   if (p) htable_update(params, p);
   int iprint[2];
   iprint[0] = (int)Number(htable_get(params, NEW_SYMBOL("iprint-1")));
   iprint[1] = (int)Number(htable_get(params, NEW_SYMBOL("iprint-2")));
   lb3_.gtol = Number(htable_get(params, NEW_SYMBOL("ls-gtol")));
   lb3_.stpmin = Number(htable_get(params, NEW_SYMBOL("ls-stpmin")));
   lb3_.stpmax = Number(htable_get(params, NEW_SYMBOL("ls-stpmax")));
   int m = (int)Number(htable_get(params, NEW_SYMBOL("lbfgs-m")));
   int n = index_nelems(x0);
   double *x = IND_ST(x0)->data;
   double  fval;
   double *gval = IND_ST(Mptr(gx))->data;
   int diagco = false;
   double *diag = mm_blob(n*sizeof(double));
   double *w = mm_blob((n*(m+m+1)+m+m)*sizeof(double));
   double xtol = eps(1); /* machine precision */
   int iflag = 0;

   ifn (n>0)
      error(NIL, "empty array", x0->backptr);
   ifn (m>0)
      error(NIL, "m-parameter must be positive", NEW_NUMBER(m));

   /* reverse communication loop */
   do {
      fval = Number(listeval_f(Car(callf), callf));
      listeval_g(Car(callg), callg);
      lbfgs_(&n, &m, x, &fval, gval, &diagco, diag, iprint, &gtol, &xtol, w, &iflag);
      assert(iflag<2);
   } while (iflag > 0);
   
   return iflag;
}
コード例 #8
0
void cg_grad_adaptor(double *g, double *x, int n)
{
   static at *call = NIL;
   static int nx = -1;
   static storage_t *stx = NULL;
   static storage_t *stg = NULL;
   static at *(*listeval)(at *, at *) = NULL;

   if (n == -1) {
      /* initialize */
      at *x0 = var_get(named("x0"));
      at *vargs = var_get(named("vargs"));
      at *g = var_get(named("g"));
      ifn (x0)
         error(NIL, "x0 not found", NIL);
      ifn (INDEXP(x0) && IND_STTYPE((index_t *)Mptr(x0)))
         error(NIL, "x0 not a double index", x0);
      ifn (g)
         error(NIL, "g not found", NIL);
      
      listeval = Class(g)->listeval;
      index_t *ind = Mptr(x0);
      nx = storage_nelems(IND_ST(ind));
      stx = new_storage(ST_DOUBLE);
      stx->flags = STS_FOREIGN;
      stx->size = nx;
      stx->data = (char *)-1;

      stg = new_storage(ST_DOUBLE);
      stg->flags = STS_FOREIGN;
      stg->size = nx;
      stg->data = (char *)-1;
      call = new_cons(g, 
                      new_cons(NEW_INDEX(stg, IND_SHAPE(ind)),
                               new_cons(NEW_INDEX(stx, IND_SHAPE(ind)),
                                        vargs)));
   } else {
      if (n != nx)
         error(NIL, "vector of different size expected", NEW_NUMBER(n));
      stx->data = x;
      stg->data = g;
      listeval(Car(call), call);
   }
}
コード例 #9
0
ファイル: oostruct.c プロジェクト: barak/lush
static unsigned long oostruct_hash(at *p)
{
   unsigned long x = 0x87654321;
   object_t *obj = Mptr(p);
   class_t *cl = Class(p);
   x ^= hash_pointer((void*)cl);
   for(int i=0; i<cl->num_slots; i++) {
      x = (x<<1) | ((long)x<0 ? 1 : 0);
      x ^= hash_value(obj->slots[i]);
   }
   return x;
}
コード例 #10
0
ファイル: oostruct.c プロジェクト: barak/lush
object_t *object_from_cobject(struct CClass_object *cobj)
{
   ifn (cobj->Vtbl && cobj->Vtbl->Cdoc && cobj->Vtbl->Cdoc->lispdata.atclass)
      error(NIL, "attempt to access instance of unlinked class", NIL);
   assert(cobj->Vtbl);
   
   object_t *obj = cobj->__lptr;
   ifn (obj) {
      class_t *cl = Mptr(cobj->Vtbl->Cdoc->lispdata.atclass);
      obj = _new_object(cl, cobj);
   }
   return obj;
}
コード例 #11
0
ファイル: oostruct.c プロジェクト: barak/lush
static const char *class_name(at *p)
{
   class_t *cl = Mptr(p);
   
   if (cl->classname)
      sprintf(string_buffer, "::class:%s", NAMEOF(cl->classname));
   else
      sprintf(string_buffer, "::class:%p", cl);
   
   if (!cl->live)
      strcat(string_buffer, " (unlinked)");

   return mm_strdup(string_buffer);
}
コード例 #12
0
double cg_value_adaptor(double *x, int n)
{
   static at *call = NIL;
   static int nx = -1;
   static storage_t *st = NULL;
   static at *(*listeval)(at *, at *) = NULL;

   if (n == -1) {
      /* initialize */
      at *x0 = var_get(named("x0"));
      at *vargs = var_get(named("vargs"));
      at *f = var_get(named("f"));
      ifn (x0)
         error(NIL, "x0 not found", NIL);
      ifn (INDEXP(x0) && IND_STTYPE((index_t *)Mptr(x0)))
         error(NIL, "x0 not a double index", x0);
      ifn (f)
         error(NIL, "f not found", NIL);

      listeval = Class(f)->listeval;
      index_t *ind = Mptr(x0);
      nx = storage_nelems(IND_ST(ind));
      st = new_storage(ST_DOUBLE);
      st->flags = STS_FOREIGN;
      st->size = nx;
      st->data = (char *)-1;

      call = new_cons(f, new_cons(NEW_INDEX(st, IND_SHAPE(ind)), vargs));
      return NAN;
         
   } else {
      if (n != nx)
         error(NIL, "vector of different size expected", NEW_NUMBER(n));
      st->data = x;
      return Number(listeval(Car(call), call));
   }
}
コード例 #13
0
ファイル: oostruct.c プロジェクト: barak/lush
static int oostruct_compare(at *p, at *q, int order)
{
   if (order)
      error(NIL, "cannot rank objects", NIL);

   object_t *obj1 = Mptr(p);
   object_t *obj2 = Mptr(q);
   class_t *cl1 = Class(obj1->backptr);
   class_t *cl2 = Class(obj2->backptr);
   
   if (cl1->num_slots != cl2->num_slots)
      return 1;
   
   /* check that slots names match before comparing values */
   for (int i=0; i<cl1->num_slots; i++)
      if (cl1->slots[i] != cl2->slots[i])
         return 1;

   for(int i=0; i<cl1->num_slots; i++) {
      if (!eq_test(obj1->slots[i], obj2->slots[i]))
         return 1;
   }
   return 0;
}
コード例 #14
0
ファイル: storage.c プロジェクト: barak/lush
static const char *storage_name(at *p)
{
   storage_t *st = (storage_t *)Mptr(p);
   char *kind = "";
   switch (st->kind) {
   case STS_NULL:    kind = "unallocated"; break;
   case STS_MANAGED: kind = "managed"; break;
   case STS_FOREIGN: kind = "foreign"; break;
   case STS_MMAP:    kind = "mmap"; break;
   case STS_STATIC:  kind = "static"; break;
   default:
      fprintf(stderr, "internal error: invalid storage kind");
      abort();
   }
   const char *clname = nameof(Symbol(Class(p)->classname));
   sprintf(string_buffer, "::%s:%s@%p:<%"PRIdPTR">", clname, kind, st->data, st->size);
   return mm_strdup(string_buffer);
}
コード例 #15
0
ファイル: oostruct.c プロジェクト: barak/lush
at *oostruct_getslot(at *p, at *prop)
{
   at *slot = Car(prop);
   ifn (SYMBOLP(slot))
      error(NIL,"not a slot name", slot);
   prop = Cdr(prop);

   object_t *obj = Mptr(p);
   class_t *cl = Class(obj->backptr);
   for (int i=0; i<cl->num_slots; i++)
      if (slot == cl->slots[i]) {
         at *sloti = obj->slots[i];
         if (i<cl->num_cslots)
            sloti = eval(sloti);
         if (prop)
            return getslot(sloti, prop);
         else 
            return sloti;
      }
   error(NIL, "not a slot", slot);
}
コード例 #16
0
ファイル: oostruct.c プロジェクト: barak/lush
void oostruct_setslot(at *p, at *prop, at *val)
{
   at *slot = Car(prop);
   ifn (SYMBOLP(slot))
      error(NIL, "not a slot name", slot);
   prop = Cdr(prop);

   object_t *obj = Mptr(p);
   class_t *cl = Class(obj->backptr);
   for(int i=0; i<cl->num_slots; i++)
      if (slot == cl->slots[i]) {
         if (prop)
            setslot(&obj->slots[i], prop, val);
         else if (i<cl->num_cslots) {
            class_t *cl = classof(obj->slots[i]);
            cl->setslot(obj->slots[i], NIL, val);
         } else
            obj->slots[i] = val;
         return;
      }
   error(NIL, "not a slot", slot);
}
コード例 #17
0
ファイル: storage.c プロジェクト: barak/lush
static at *storage_listeval(at *p, at *q)
{
   storage_t *st = Mptr(p);
   
   if (!st->data)
      error(NIL, "unsized storage", p);
   
   q = eval_arglist(Cdr(q));
   ifn (CONSP(q) && Car(q) && NUMBERP(Car(q)))
      error(NIL, "illegal subscript", q);
   
   ssize_t off = Number(Car(q));
   if (off<0 || off>=st->size)
      error(NIL, "subscript out of range", q);
   
   if (Cdr(q)) {
      ifn (CONSP(Cdr(q)) && !Cddr(q))
         error(NIL, "one or two arguments expected",q);
      storage_setat[st->type](st, off, Cadr(q));
      return st->backptr;
   } else
      return storage_getat[st->type](st, off);
}
コード例 #18
0
ファイル: oostruct.c プロジェクト: barak/lush
at *with_object(at *p, at *f, at *q, int howfar)
{
   assert(howfar>=0);
   MM_ENTER;
   at *ans = NIL;
   if (OBJECTP(p)) {
      object_t *obj = Mptr(p);
      class_t *cl = Class(obj->backptr);
      if (howfar > cl->num_slots)
         howfar = cl->num_slots;
      
      /* push object environment */
      for (int i = 0; i<howfar; i++) {
         Symbol(cl->slots[i]) = symbol_push(Symbol(cl->slots[i]), 0 , &(obj->slots[i]));
         if (i < cl->num_cslots)
            MARKVAR_SYMBOL(Symbol(cl->slots[i]));
      }
      SYMBOL_PUSH(at_this, p);
      LOCK_SYMBOL(Symbol(at_this));

      ans = apply(f, q);
      
      /* pop object environment */
      SYMBOL_POP(at_this);
      for (int i = 0; i<howfar; i++)
         SYMBOL_POP(cl->slots[i]);
      
   } else {
      if (p == NIL)
         printf("*** Warning\007 (with-object () ...)\n");
      SYMBOL_PUSH(at_this, p);
      ans = apply(f, q);
      SYMBOL_POP(at_this);
   }
   
   MM_RETURN(ans);
}
コード例 #19
0
ファイル: HPL_pdrpanllT.c プロジェクト: davidrohr/hpl-gpu
void HPL_pdrpanllT
(
   HPL_T_panel *                    PANEL,
   const int                        M,
   const int                        N,
   const int                        ICOFF,
   double *                         WORK
)
{
/* 
 * Purpose
 * =======
 *
 * HPL_pdrpanllT recursively  factorizes  a panel of columns  using  the
 * recursive Left-looking variant of the one-dimensional algorithm.  The
 * lower  triangular  N0-by-N0  upper block  of  the panel  is stored in
 * transpose form.
 *  
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
 * operations  at once  for one column in the panel.  This  results in a
 * lower number of slightly larger  messages than usual.  On P processes
 * and assuming bi-directional links,  the running time of this function
 * can be approximated by (when N is equal to N0):                      
 *  
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
 *    N0^2 * ( M - N0/3 ) * gam2-3
 *  
 * where M is the local number of rows of  the panel, lat and bdwth  are
 * the latency and bandwidth of the network for  double  precision  real
 * words,  and  gam2-3  is an estimate of the  Level 2 and Level 3  BLAS
 * rate of execution. The  recursive  algorithm  allows indeed to almost
 * achieve  Level 3 BLAS  performance  in the panel factorization.  On a
 * large  number of modern machines,  this  operation is however latency
 * bound,  meaning  that its cost can  be estimated  by only the latency
 * portion N0 * log_2(P) * lat.  Mono-directional links will double this
 * communication cost.
 *
 * Arguments
 * =========
 *
 * PANEL   (local input/output)          HPL_T_panel *
 *         On entry,  PANEL  points to the data structure containing the
 *         panel information.
 *
 * M       (local input)                 const int
 *         On entry,  M specifies the local number of rows of sub(A).
 *
 * N       (local input)                 const int
 *         On entry,  N specifies the local number of columns of sub(A).
 *
 * ICOFF   (global input)                const int
 *         On entry, ICOFF specifies the row and column offset of sub(A)
 *         in A.
 *
 * WORK    (local workspace)             double *
 *         On entry, WORK  is a workarray of size at least 2*(4+2*N0).
 *
 * ---------------------------------------------------------------------
 */ 
/*
 * .. Local Variables ..
 */
   double                     * A, * Aptr, * L1, * L1ptr;
   int                        curr, ii, ioff, jb, jj, lda, m, n, n0, nb,
                              nbdiv, nbmin;
/* ..
 * .. Executable Statements ..
 */
   if( N <= ( nbmin = PANEL->algo->nbmin ) )
   { PANEL->algo->pffun( PANEL, M, N, ICOFF, WORK ); return; }
/*
 * Find  new recursive blocking factor.  To avoid an infinite loop,  one
 * must guarantee: 1 <= jb < N, knowing that  N  is greater than  NBMIN.
 * First, we compute nblocks:  the number of blocks of size  NBMIN in N,
 * including the last one that may be smaller.  nblocks  is thus  larger
 * than or equal to one, since N >= NBMIN.
 * The ratio ( nblocks + NDIV - 1 ) / NDIV  is thus larger than or equal
 * to one as  well.  For  NDIV >= 2,  we  are guaranteed  that the quan-
 * tity ( ( nblocks + NDIV  - 1 ) / NDIV ) * NBMIN  is less  than N  and
 * greater than or equal to NBMIN.
 */
   nbdiv = PANEL->algo->nbdiv; ii = jj = 0; m = M; n = N;
   nb = jb = ( (((N+nbmin-1) / nbmin) + nbdiv  - 1) / nbdiv ) * nbmin;
 
   A     = PANEL->A;   lda = PANEL->lda;
   L1    = PANEL->L1;  n0  = PANEL->jb;
   L1ptr = Mptr( L1, ICOFF, ICOFF, n0 );
   curr  = (int)( PANEL->grid->myrow == PANEL->prow );

   if( curr != 0 ) Aptr = Mptr( A, ICOFF, ICOFF, lda );
   else            Aptr = Mptr( A,     0, ICOFF, lda );
/*
 * The triangular solve is replicated in every  process row.  The  panel
 * factorization is  such that  the first rows of  A  are accumulated in
 * every process row during the (panel) swapping phase.  We  ensure this
 * way a minimum amount  of communication during the entire panel facto-
 * rization.
 */
   do
   {
      n -= jb; ioff = ICOFF + jj;
/*
 * Replicated solve - Local update - Factor current panel
 */
      HPL_dtrsm( HplColumnMajor, HplRight, HplUpper, HplNoTrans,
                 HplUnit, jb, jj, HPL_rone, L1ptr, n0, Mptr( L1ptr,
                 jj, 0, n0 ), n0 );
      HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, m, jb,
                 jj, -HPL_rone, Mptr( Aptr, ii, 0, lda ), lda,
                 Mptr( L1ptr, jj, 0, n0 ), n0, HPL_rone,
                 Mptr( Aptr, ii, jj, lda ), lda );
      HPL_pdrpanllT( PANEL, m, jb, ioff, WORK );
/*
 * Copy back upper part of A in current process row - Go the next block
 */
      if( curr != 0 )
      {
         HPL_dlatcpy( ioff, jb, Mptr( L1, ioff, 0, n0 ), n0,
                      Mptr( A, 0, ioff, lda ), lda );
         ii += jb; m -= jb;
      }
      jj += jb; jb = Mmin( n, nb );

   } while( n > 0 );
/*
 * End of HPL_pdrpanllT
 */
}
コード例 #20
0
ファイル: HPL_pdtrsv.c プロジェクト: ddemidov/hpl-gpu
void HPL_pdtrsv(HPL_T_grid* GRID, HPL_T_pmat* AMAT)
{
/* 
 * Purpose
 * =======
 *
 * HPL_pdtrsv solves an upper triangular system of linear equations.
 *  
 * The rhs is the last column of the N by N+1 matrix A. The solve starts
 * in the process  column owning the  Nth  column of A, so the rhs b may
 * need to be moved one process column to the left at the beginning. The
 * routine therefore needs  a column  vector in every process column but
 * the one owning  b. The result is  replicated in all process rows, and
 * returned in XR, i.e. XR is of size nq = LOCq( N ) in all processes.
 *  
 * The algorithm uses decreasing one-ring broadcast in process rows  and
 * columns  implemented  in terms of  synchronous communication point to
 * point primitives.  The  lookahead of depth 1 is used to minimize  the
 * critical path. This entire operation is essentially ``latency'' bound
 * and an estimate of its running time is given by:
 *  
 *    (move rhs) lat + N / ( P bdwth ) +            
 *    (solve)    ((N / NB)-1) 2 (lat + NB / bdwth) +
 *               gam2 N^2 / ( P Q ),                
 *  
 * where  gam2   is an estimate of the   Level 2 BLAS rate of execution.
 * There are  N / NB  diagonal blocks. One must exchange  2  messages of
 * length NB to compute the next  NB  entries of the vector solution, as
 * well as performing a total of N^2 floating point operations.
 *
 * Arguments
 * =========
 *
 * GRID    (local input)                 HPL_T_grid *
 *         On entry,  GRID  points  to the data structure containing the
 *         process grid information.
 *
 * AMAT    (local input/output)          HPL_T_pmat *
 *         On entry,  AMAT  points  to the data structure containing the
 *         local array information.
 *
 * ---------------------------------------------------------------------
 */ 
//Local Variables
	MPI_Comm Ccomm, Rcomm;
	double *A=NULL, *Aprev=NULL, *Aptr, *XC=NULL, *XR=NULL, *Xd=NULL, *Xdprev=NULL, *W=NULL;
	int Alcol_matrix, Alcol_process, Alrow, Anpprev, Anp, Anq, Bcol, Cmsgid, GridIsNotPx1, GridIsNot1xQ, Rmsgid,
		colprev, kb, kbprev, lda, mycol, myrow, n, n1, n1p, n1pprev=0, nb, npcol, nprow, rowprev, tmp1, tmp2, Wsize;
	int sendcol_matrix = -1;

//Executable Statements
	HPL_ptimer_detail( HPL_TIMING_PTRSV );
	if ((n = AMAT->n) <= 0) return;
	nb = AMAT->nb;
	lda = AMAT->ld;
	A = AMAT->A;
	XR = AMAT->X;

	(void) HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
	//if (mycol >= 2) return;
	//npcol = 2;
	Rcomm = GRID->row_comm;
	Rmsgid = MSGID_BEGIN_PTRSV;
	Ccomm = GRID->col_comm;
	Cmsgid = MSGID_BEGIN_PTRSV + 1;
	GridIsNot1xQ = (nprow > 1);
	GridIsNotPx1 = (npcol > 1);

//Move the rhs in the process column owning the last column of A.
	Mnumrow(Anp, n, nb, myrow, nprow);
	Mnumcol(Anq, n, nb, mycol, GRID);

	tmp1 = (n - 1) / nb;
	Alrow = tmp1 % nprow;
	Alcol_matrix = tmp1;
	Alcol_process = MColBlockToPCol(Alcol_matrix, GRID);
	kb = n - tmp1 * nb;

	Aptr = (double *) (A);
	XC = Mptr(Aptr, 0, Anq, lda);
	Mindxg2p_col(n, nb, nb, Bcol, 0, GRID);

	if ((Anp > 0) && (Alcol_process != Bcol))
	{
		if(mycol == Bcol)
		{
			(void) HPL_send(XC, Anp, Alcol_process, Rmsgid, Rcomm);
		}
		else if(mycol == Alcol_process)
		{
			(void) HPL_recv(XC, Anp, Bcol, Rmsgid, Rcomm);
		}
	}
	Rmsgid = (Rmsgid + 2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid + 2);
	if (mycol != Alcol_process)
	{
		for(tmp1 = 0; tmp1 < Anp; tmp1++)
		{
			XC[tmp1] = HPL_rzero;
		}
	}

//Set up lookahead
	//n1 = (npcol - 1) * nb;
	//n1 = Mmax(n1, nb);
	n1 = HPL_n1(Alcol_matrix, nb, GRID);
	Wsize = Mmin((npcol - 1) * nb, Anp);
	if (Wsize > 0)
	{
		W = (double*) malloc((size_t) Wsize * sizeof(double));
		if (W == NULL)
		{
			HPL_pabort(__LINE__, "HPL_pdtrsv", "Memory allocation failed");
		}
	}

	Anpprev = Anp;
	Xdprev = XR;
	Aprev = Aptr = Mptr(Aptr, 0, Anq, lda);
	tmp1 = n - kb;
	tmp1 -= (tmp2 = Mmin(tmp1, n1));
	MnumrowI(n1pprev, tmp2, Mmax(0, tmp1), nb, myrow, nprow);

	if (myrow == Alrow)
	{
		Anpprev = (Anp -= kb);
	}
	if (mycol == Alcol_process)
	{
		Aprev = (Aptr -= lda * kb);
		Anq -= kb;
		Xdprev = (Xd = XR + Anq);
		if (myrow == Alrow)
		{
			HPL_dtrsv(HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, kb, Aptr+Anp, lda, XC+Anp, 1);
			//fprintfqt(STD_OUT, "Process %d: dtrsv, offset %d\n", GRID->iam, Anp);
			HPL_dcopy(kb, XC+Anp, 1, Xd, 1);
		}
	}

	n -= kb;

// Start the operations
	while(n > 0)
	{
		rowprev = Alrow;
		Alrow = MModSub1(Alrow, nprow);
		colprev = Alcol_process;
		Alcol_matrix--;
		Alcol_process = MColBlockToPCol(Alcol_matrix, GRID);
		kbprev = kb;
		n1 = HPL_n1(Alcol_matrix, nb, GRID);
		tmp1 = n - (kb = nb);
		tmp1 -= (tmp2 = Mmin(tmp1, n1));
		MnumrowI(n1p, tmp2, Mmax(0, tmp1), nb, myrow, nprow);
		if(mycol == Alcol_process)
		{
			Aptr -= lda * kb;
			Anq -= kb;
			Xd = XR + Anq;
		}
		if(myrow == Alrow)
		{
			Anp -= kb;
		}
/*
 * Broadcast  (decreasing-ring)  of  previous solution block in previous
 * process column,  compute  partial update of current block and send it
 * to current process column.
 */
		if (mycol == colprev)
		{
			//Send previous solution block in process row above
			if (myrow == rowprev)
			{
				if (GridIsNot1xQ)
				{
					(void) HPL_send(Xdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm);
				}
			}
			else
			{
				(void) HPL_recv(Xdprev, kbprev, MModAdd1(myrow, nprow), Cmsgid, Ccomm);
			}
		}

		if (Alcol_process < colprev ? (mycol <= colprev && mycol > Alcol_process) : (mycol <= colprev || mycol > Alcol_process))
		{
			if (mycol == colprev)
			{
				//Compute partial update of previous solution block and send it to current column
				tmp1 = Anpprev - n1pprev;
				HPL_dgemv(HplColumnMajor, HplNoTrans, n1pprev, kbprev, -HPL_rone, Aprev+tmp1, lda, Xdprev, 1, HPL_rone, XC+tmp1, 1 );
				//fprintfqt(STD_OUT, "Process %d: dgemv %d rows starting from %d\n", GRID->iam, n1pprev, tmp1);
				sendcol_matrix = Alcol_matrix;
			}
			
			if(GridIsNotPx1)
			{
				if (sendcol_matrix != -1)
				{
					tmp2 = 1;
					while (sendcol_matrix >= tmp2 && !(GRID->col_mapping[sendcol_matrix - tmp2 + 1] > GRID->col_mapping[sendcol_matrix - tmp2] ?
						(GRID->col_mapping[sendcol_matrix - tmp2 + 1] >= mycol && GRID->col_mapping[sendcol_matrix - tmp2] <= mycol) :
						(GRID->col_mapping[sendcol_matrix - tmp2 + 1] >= mycol || GRID->col_mapping[sendcol_matrix - tmp2] <= mycol)))
					{
						tmp2++;
					}
					sendcol_matrix -= tmp2;
					tmp2 *= nb;
					MnumrowI(tmp1, tmp2, n - tmp2, nb, myrow, nprow);
					tmp2 = Anpprev - tmp1;
				}
				else
				{
					tmp2 = 0;
					tmp1 = 0;
				}
				
				//fprintfqt(STD_OUT, "Process %d: sending to %d (%d bytes starting from %d, partial update)\n", GRID->iam, Alcol_process, tmp1, tmp2);
				(void) MPI_Send(XC+tmp2, tmp1, MPI_DOUBLE, Alcol_process, Rmsgid, Rcomm);
			}
		}

		if (mycol == colprev)
		{
			//Finish the (decreasing-ring) broadcast of the solution block in previous process column
			if((myrow != rowprev) && (myrow != MModAdd1(rowprev, nprow)))
			{
				//(void) HPL_send(Xdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm);
				MPI_Send(Xdprev, kbprev, MPI_DOUBLE, MModSub1(myrow, nprow), Cmsgid, Ccomm);
			}
		}
		else if (mycol == Alcol_process)
		{
			//Current column receives and accumulates partial update of previous solution block
			for (int i = colprev;(i - mycol) % npcol != 0;i = (i + npcol - 1) % npcol)
			{
				MPI_Status tmpstatus;
				int recvsize;
				//fprintfqt(STD_OUT, "Process %d starting receive from %d (buffer %d)\n", GRID->iam, i, Wsize);
				MPI_Recv(W, Wsize, MPI_DOUBLE, i, Rmsgid, Rcomm, &tmpstatus);
				MPI_Get_count(&tmpstatus, MPI_DOUBLE, &recvsize);
				//fprintfqt(STD_OUT, "Process %d: received from %d (%d bytes starting from %d)\n", GRID->iam, i, recvsize, Anpprev - recvsize);
				HPL_daxpy(recvsize, HPL_rone, W, 1, XC+Anpprev-recvsize, 1);
			}
		}
		
		//Solve current diagonal block 
		if((mycol == Alcol_process) && (myrow == Alrow))
		{
			HPL_dtrsv(HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, kb, Aptr+Anp, lda, XC+Anp, 1);
			//fprintfqt(STD_OUT, "Process %d: dtrsv, offset %d\n", GRID->iam, Anp);
			HPL_dcopy(kb, XC+Anp, 1, XR+Anq, 1);
		}

		//Finish previous update
		if((mycol == colprev) && ((tmp1 = Anpprev - n1pprev ) > 0))
		{
			HPL_dgemv(HplColumnMajor, HplNoTrans, tmp1, kbprev, -HPL_rone, Aprev, lda, Xdprev, 1, HPL_rone, XC, 1);
			//fprintfqt(STD_OUT, "Process %d: dgemv (%d rows starting from %d, finishing)\n", GRID->iam, tmp1, 0);
		}

		//Save info of current step and update info for the next step
		if (mycol == Alcol_process)
		{
			Xdprev = Xd;
			Aprev = Aptr;
		}
		if (myrow == Alrow)
		{
			Anpprev -= kb;
		}
		n1pprev = n1p;
		n -= kb;
		Rmsgid = (Rmsgid+2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid+2);
		Cmsgid = (Cmsgid+2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV+1 : Cmsgid+2);
	}
	rowprev = Alrow;
	colprev = Alcol_process;
	kbprev = kb;

	//Replicate last solution block
	if (mycol == colprev)
	{
		(void) HPL_broadcast((void *) (XR), kbprev, HPL_DOUBLE, rowprev, Ccomm);
	}

	if (Wsize) free(W);
	HPL_ptimer_detail(HPL_TIMING_PTRSV);

	//End of HPL_pdtrsv
}
コード例 #21
0
ファイル: HPL_pdpanrlT.c プロジェクト: davidrohr/hpl-gpu
void HPL_pdpanrlT
(
   HPL_T_panel *                    PANEL,
   const int                        M,
   const int                        N,
   const int                        ICOFF,
   double *                         WORK
)
{
/* 
 * Purpose
 * =======
 *
 * HPL_pdpanrlT factorizes  a panel of columns  that is a sub-array of a
 * larger one-dimensional panel A using the Right-looking variant of the
 * usual one-dimensional algorithm.  The lower triangular N0-by-N0 upper
 * block of the panel is stored in transpose form.
 *  
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
 * operations  at once  for one column in the panel.  This  results in a
 * lower number of slightly larger  messages than usual.  On P processes
 * and assuming bi-directional links,  the running time of this function
 * can be approximated by (when N is equal to N0):
 *  
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
 *    N0^2 * ( M - N0/3 ) * gam2-3
 *  
 * where M is the local number of rows of  the panel, lat and bdwth  are
 * the latency and bandwidth of the network for  double  precision  real
 * words,  and  gam2-3  is an estimate of the  Level 2 and Level 3  BLAS
 * rate of execution. The  recursive  algorithm  allows indeed to almost
 * achieve  Level 3 BLAS  performance  in the panel factorization.  On a
 * large  number of modern machines,  this  operation is however latency
 * bound,  meaning  that its cost can  be estimated  by only the latency
 * portion N0 * log_2(P) * lat.  Mono-directional links will double this
 * communication cost.
 *  
 * Note that  one  iteration of the the main loop is unrolled. The local
 * computation of the absolute value max of the next column is performed
 * just after its update by the current column. This allows to bring the
 * current column only  once through  cache at each  step.  The  current
 * implementation  does not perform  any blocking  for  this sequence of
 * BLAS operations, however the design allows for plugging in an optimal
 * (machine-specific) specialized  BLAS-like kernel.  This idea has been
 * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center.
 *
 * Arguments
 * =========
 *
 * PANEL   (local input/output)          HPL_T_panel *
 *         On entry,  PANEL  points to the data structure containing the
 *         panel information.
 *
 * M       (local input)                 const int
 *         On entry,  M specifies the local number of rows of sub(A).
 *
 * N       (local input)                 const int
 *         On entry,  N specifies the local number of columns of sub(A).
 *
 * ICOFF   (global input)                const int
 *         On entry, ICOFF specifies the row and column offset of sub(A)
 *         in A.
 *
 * WORK    (local workspace)             double *
 *         On entry, WORK  is a workarray of size at least 2*(4+2*N0).
 *
 * ---------------------------------------------------------------------
 */ 
/*
 * .. Local Variables ..
 */
   double                     * A, * Acur, * Anxt, * L1;
   int                        Mm1, Nm1, curr, ii, iip1, jj, lda, m=M,
                              n0;
/* ..
 * .. Executable Statements ..
 */
   A    = PANEL->A;   lda = PANEL->lda;
   L1   = PANEL->L1;  n0  = PANEL->jb;
   curr = (int)( PANEL->grid->myrow == PANEL->prow );

   Nm1  = N - 1; jj = ICOFF;
   if( curr != 0 ) { ii = ICOFF; iip1 = ii+1; Mm1 = m-1; }
   else            { ii = 0;     iip1 = ii;   Mm1 = m;   }
/*
 * Find local absolute value max in first column - initialize WORK[0:3]
 */
   HPL_dlocmax( PANEL, m, ii, jj, WORK );

   while( Nm1 >= 1 )
   {
      Acur = Mptr( A, iip1, jj, lda ); Anxt = Mptr( Acur, 0, 1, lda );
/*
 * Swap and broadcast the current row
 */
      HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
      HPL_dlocswpT( PANEL,    ii, jj, WORK );
/*
 * Scale current column by its absolute value max entry  -  Update trai-
 * ling sub-matrix and find local absolute value max in next column (On-
 * ly one pass through cache for each current column).  This sequence of
 * operations could benefit from a specialized blocked implementation.
 */
      if( WORK[0] != HPL_rzero )
         HPL_dscal( Mm1, HPL_rone / WORK[0], Acur, 1 );
      HPL_daxpy( Mm1, -(*(Mptr( L1, jj+1, jj, n0 ))), Acur, 1, Anxt, 1 );
      HPL_dlocmax( PANEL, Mm1, iip1, jj+1, WORK );

      if( Nm1 > 1 )
      {
         HPL_dger( HplColumnMajor, Mm1, Nm1-1, -HPL_rone, Acur, 1,
                   Mptr( L1, jj+2, jj, n0 ), 1, Mptr( Anxt, 0, 1, lda ),
                   lda );
      }
      if( curr != 0 ) { ii = iip1; iip1++; m = Mm1; Mm1--; }

      Nm1--; jj++;
   }
/*
 * Swap and broadcast last row - Scale last column by its absolute value
 * max entry
 */ 
   HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
   HPL_dlocswpT( PANEL,    ii, jj, WORK );
   if( WORK[0] != HPL_rzero )
      HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );
/*
 * End of HPL_pdpanrlT
 */
}
コード例 #22
0
ファイル: oostruct.c プロジェクト: barak/lush
class_t *new_ooclass(at *classname, at *atsuper, at *new_slots, at *defaults)
{
   ifn (SYMBOLP(classname))
      error(NIL, "not a valid class name", classname);
   ifn (length(new_slots) == length(defaults))
      error(NIL, "number of slots must match number of defaults", NIL);
   ifn (CLASSP(atsuper))
      error(NIL, "not a class", atsuper);

   class_t *super = Mptr(atsuper);
   if (builtin_class_p(super))
      error(NIL, "superclass not descendend of class <object>", atsuper);

   at *p = new_slots;
   at *q = defaults;
   int n = 0;
   while (CONSP(p) && CONSP(q)) {
      ifn (SYMBOLP(Car(p)))
         error(NIL, "not a symbol", Car(p));
      for (int i=0; i<super->num_slots; i++)
         if (Car(p) == super->slots[i])
            error(NIL, "a superclass has slot of this name", Car(p));
      p = Cdr(p);
      q = Cdr(q);
      n++;
   }
   assert(!p && !q);
  
   /* builds the new class */
   class_t *cl = mm_alloc(mt_class);
   *cl = *object_class;
   if (new_slots) {
      int num_slots = n + super->num_slots;
      cl->slots = mm_allocv(mt_refs, num_slots*sizeof(at *));
      cl->defaults = mm_allocv(mt_refs, num_slots*sizeof(at *));
      int i = 0;
      for (; i<super->num_slots; i++) {
         cl->slots[i] = super->slots[i];
         cl->defaults[i] = super->defaults[i];
      }
      for (at *s = new_slots, *d = defaults; CONSP(s); i++, s = Cdr(s), d = Cdr(d)) {
         cl->slots[i] = Car(s);
         cl->defaults[i] = Car(d);
      }
      assert(i == num_slots);
      cl->num_slots = num_slots;
   } else {
      cl->slots = super->slots;
      cl->defaults = super->defaults;
      cl->num_slots = super->num_slots;
   }

   /* Sets up classname */
   cl->classname = classname;
   cl->priminame = classname;
   
   /* Sets up lists */
   cl->super = super;
   cl->atsuper = atsuper;
   cl->nextclass = super->subclasses;
   cl->subclasses = 0L;
   super->subclasses = cl;
  
   /* Initialize the slots */
   cl->myslots = new_slots;
   cl->mydefaults = defaults;
   
   /* Initialize the methods and hash table */
   cl->methods = NIL;
   cl->hashtable = 0L;
   cl->hashsize = 0;
   cl->hashok = 0;
   
   /* Initialize DHCLASS stuff */
   cl->has_compiled_part = super->has_compiled_part;
   cl->num_cslots = super->num_cslots;
   cl->classdoc = 0;
   cl->kname = 0;
   
   /* Create AT and returns it */
   assert(class_class);
   cl->backptr = new_at(class_class, cl);
   return cl;
}
コード例 #23
0
ファイル: HPL_dlocswpT.c プロジェクト: ddemidov/hpl-gpu
   double                     * A1, * A2, * L, * Wr0, * Wmx;
   int                        ilindx, lda, myrow, n0, nr, nu;
   register int               i;
/* ..
 * .. Executable Statements ..
 */
   myrow = PANEL->grid->myrow; n0 = PANEL->jb; lda = PANEL->lda;

   Wr0   = ( Wmx = WORK + 4 ) + n0; Wmx[JJ] = gmax = WORK[0];
   nu    = (int)( ( (unsigned int)(n0) >> HPL_LOCSWP_LOG2_DEPTH ) 
                  << HPL_LOCSWP_LOG2_DEPTH );
   nr    = n0 - nu;
/*
 * Replicated swap and copy of the current (new) row of A into L1
 */
   L  = Mptr( PANEL->L1, 0, JJ, n0  );
/*
 * If the pivot is non-zero ...
 */
   if( gmax != HPL_rzero )
   {
/*
 * and if I own the current row of A ...
 */
      if( myrow == PANEL->prow )
      {
/*
 * and if I also own the row to be swapped with the current row of A ...
 */
         if( myrow == (int)(WORK[3]) )
         {
コード例 #24
0
ファイル: HPL_pdfact.c プロジェクト: ddemidov/hpl-gpu
void HPL_pdfact
(
   HPL_T_panel *                    PANEL
)
{
/* 
 * Purpose
 * =======
 *
 * HPL_pdfact recursively factorizes a  1-dimensional  panel of columns.
 * The  RPFACT  function pointer specifies the recursive algorithm to be
 * used, either Crout, Left- or Right looking.  NBMIN allows to vary the
 * recursive stopping criterium in terms of the number of columns in the
 * panel, and  NDIV allows to specify the number of subpanels each panel
 * should be divided into. Usuallly a value of 2 will be chosen. Finally
 * PFACT is a function pointer specifying the non-recursive algorithm to
 * to be used on at most NBMIN columns. One can also choose here between
 * Crout, Left- or Right looking.  Empirical tests seem to indicate that
 * values of 4 or 8 for NBMIN give the best results.
 *  
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
 * operations  at once  for one column in the panel.  This  results in a
 * lower number of slightly larger  messages than usual.  On P processes
 * and assuming bi-directional links,  the running time of this function
 * can be approximated by (when N is equal to N0):                      
 *  
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
 *    N0^2 * ( M - N0/3 ) * gam2-3
 *  
 * where M is the local number of rows of  the panel, lat and bdwth  are
 * the latency and bandwidth of the network for  double  precision  real
 * words, and  gam2-3  is  an estimate of the  Level 2 and Level 3  BLAS
 * rate of execution. The  recursive  algorithm  allows indeed to almost
 * achieve  Level 3 BLAS  performance  in the panel factorization.  On a
 * large  number of modern machines,  this  operation is however latency
 * bound,  meaning  that its cost can  be estimated  by only the latency
 * portion N0 * log_2(P) * lat.  Mono-directional links will double this
 * communication cost.
 *
 * Arguments
 * =========
 *
 * PANEL   (local input/output)          HPL_T_panel *
 *         On entry,  PANEL  points to the data structure containing the
 *         panel information.
 *
 * ---------------------------------------------------------------------
 */ 
/*
 * .. Local Variables ..
 */
   void                       * vptr = NULL;
   int                        align, jb;
/* ..
 * .. Executable Statements ..
 */
   jb = PANEL->jb; PANEL->n -= jb; PANEL->ja += jb;

   if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return;
   HPL_ptimer_detail( HPL_TIMING_RPFACT );
   VT_USER_START_A("Factorization");
   align = PANEL->algo->align;
   vptr  = (void *)malloc( ( (size_t)(align) + 
              (size_t)(((4+((unsigned int)(jb) << 1)) << 1) )) *
              sizeof(double) );
   if( vptr == NULL )
   { HPL_pabort( __LINE__, "HPL_pdfact", "Memory allocation failed" ); }
/*
 * Factor the panel - Update the panel pointers
 */
   PANEL->algo->rffun( PANEL, PANEL->mp, jb, 0, (double *)HPL_PTR( vptr,
                       ((size_t)(align) * sizeof(double) ) ) );
   if( vptr ) free( vptr );

   PANEL->A   = Mptr( PANEL->A, 0, jb, PANEL->lda );
   PANEL->nq -= jb; PANEL->jj += jb;
   VT_USER_END_A("Factorization");
   HPL_ptimer_detail( HPL_TIMING_RPFACT );
/*
 * End of HPL_pdfact
 */
}
コード例 #25
0
ファイル: HPL_pdpanel_init.c プロジェクト: davidrohr/hpl-gpu
void HPL_pdpanel_init
(
   HPL_T_grid *                     GRID,
   HPL_T_palg *                     ALGO,
   const int                        M,
   const int                        N,
   const int                        JB,
   const int                        NB,
   HPL_T_pmat *                     A,
   const int                        IA,
   const int                        JA,
   const int                        TAG,
   HPL_T_panel *                    PANEL
)
{
/* 
 * Purpose
 * =======
 *
 * HPL_pdpanel_init initializes a panel data structure.
 * 
 *
 * Arguments
 * =========
 *
 * GRID    (local input)                 HPL_T_grid *
 *         On entry,  GRID  points  to the data structure containing the
 *         process grid information.
 *
 * ALGO    (global input)                HPL_T_palg *
 *         On entry,  ALGO  points to  the data structure containing the
 *         algorithmic parameters.
 *
 * M       (local input)                 const int
 *         On entry, M specifies the global number of rows of the panel.
 *         M must be at least zero.
 *
 * N       (local input)                 const int
 *         On entry,  N  specifies  the  global number of columns of the
 *         panel and trailing submatrix. N must be at least zero.
 *
 * JB      (global input)                const int
 *         On entry, JB specifies is the number of columns of the panel.
 *         JB must be at least zero.
 *
 * A       (local input/output)          HPL_T_pmat *
 *         On entry, A points to the data structure containing the local
 *         array information.
 *
 * IA      (global input)                const int
 *         On entry,  IA  is  the global row index identifying the panel
 *         and trailing submatrix. IA must be at least zero.
 *
 * JA      (global input)                const int
 *         On entry, JA is the global column index identifying the panel
 *         and trailing submatrix. JA must be at least zero.
 *
 * TAG     (global input)                const int
 *         On entry, TAG is the row broadcast message id.
 *
 * PANEL   (local input/output)          HPL_T_panel *
 *         On entry,  PANEL  points to the data structure containing the
 *         panel information.
 *
 * ---------------------------------------------------------------------
 */ 
START_TRACE( PDPANEL_INIT )

/*
 * .. Local Variables ..
 */
   size_t                     dalign;
   int                        icurcol, icurrow, ii, itmp1, jj, lwork,
                              ml2, mp, mycol, myrow, npcol, nprow,
                              nq, nu;
   int i;
/* ..
 * .. Executable Statements ..
 */
   PANEL->grid    = GRID;                  /* ptr to the process grid */
   PANEL->algo    = ALGO;               /* ptr to the algo parameters */
   PANEL->pmat    = A;                 /* ptr to the local array info */

   myrow = GRID->myrow; mycol = GRID->mycol;
   nprow = GRID->nprow; npcol = GRID->npcol;

   HPL_infog2l( IA, JA, NB, NB, 0, 0, myrow, mycol,
                nprow, npcol, &ii, &jj, &icurrow, &icurcol, GRID );
   mp = HPL_numrowI( M, IA, NB, myrow, nprow );
   nq = HPL_numcolI( N, JA, NB, mycol, GRID );
                                         /* ptr to trailing part of A */
   PANEL->A       = Mptr( (double *)(A->A), ii, jj, A->ld );
/*
 * Workspace pointers are initialized to NULL.
 */
   PANEL->L2      = NULL; PANEL->L1      = NULL;
   PANEL->DPIV    = NULL; PANEL->DINFO   = NULL; PANEL->U       = NULL;
/*
 * Local lengths, indexes process coordinates
 */
   PANEL->nb      = NB;               /* distribution blocking factor */
   PANEL->jb      = JB;                                /* panel width */
   PANEL->m       = M;      /* global # of rows of trailing part of A */
   PANEL->n       = N;      /* global # of cols of trailing part of A */
   PANEL->ia      = IA;     /* global row index of trailing part of A */
   PANEL->ja      = JA;     /* global col index of trailing part of A */
   PANEL->mp      = mp;      /* local # of rows of trailing part of A */
   PANEL->nq      = nq;      /* local # of cols of trailing part of A */
   PANEL->ii      = ii;      /* local row index of trailing part of A */
   PANEL->jj      = jj;      /* local col index of trailing part of A */
   PANEL->lda     = A->ld;            /* local leading dim of array A */
   PANEL->prow    = icurrow; /* proc row owning 1st row of trailing A */
   PANEL->pcol    = icurcol; /* proc col owning 1st col of trailing A */
   PANEL->msgid   = TAG;     /* message id to be used for panel bcast */
/*
 * Initialize  ldl2 and len to temporary dummy values and Update tag for
 * next panel
 */
   PANEL->ldl2    = 0;               /* local leading dim of array L2 */
   PANEL->len     = 0;           /* length of the buffer to broadcast */
/*
 * Figure out the exact amount of workspace  needed by the factorization
 * and the update - Allocate that space - Finish the panel data structu-
 * re initialization.
 *
 * L1:    JB x JB in all processes
 * DPIV:  JB      in all processes
 * DINFO: 1       in all processes
 *
 * We make sure that those three arrays are contiguous in memory for the
 * later panel broadcast.  We  also  choose  to put this amount of space 
 * right  after  L2 (when it exist) so that one can receive a contiguous
 * buffer.
 */
   dalign = ALGO->align * sizeof( double );

   if( npcol == 1 )                             /* P x 1 process grid */
   {                                     /* space for L1, DPIV, DINFO */
      lwork = ALGO->align + ( PANEL->len = JB * JB + JB + 1 );
      if( nprow > 1 )                                 /* space for U */
      {
          nu = nq - JB;
          if (nu % 8) nu += 8 - nu % 8;
          if (nu % 16 == 0) nu += 8;
          lwork += JB * Mmax( 0, nu ) + ALGO->align; }

//printf("WORK1 %d of %d\n", (int) lwork, (int) panel_max_lwork);
	  if (lwork > PANEL->memalloc)
	  {
		for (i = 0;i < PANEL_PREALLOC_COUNT;i++)
		{
			if (p_lwork[i] != NULL && lwork <= panel_max_lwork)
			{
				PANEL->WORK = p_lwork[i];
				PANEL->memalloc = panel_max_lwork;
				p_lwork[i] = NULL;
				break;
			}
		}
		if (i == PANEL_PREALLOC_COUNT)
		{
			HPL_pabort( __LINE__, "HPL_pdpanel_init", "Problem with preallocated panel memory");
		}
/*		if (PANEL->WORK)
		{
			 CALDGEMM_free(PANEL->WORK);
			 fprintf(STD_OUT, "WARNING, reallocating Panel memory\n");
		}
		if( !( PANEL->WORK = (void *) CALDGEMM_alloc( (size_t)(lwork) * sizeof( double ), 0) ) )
		{
			HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" );
		}
		PANEL->memalloc = lwork;*/
	  }

/*
 * Initialize the pointers of the panel structure  -  Always re-use A in
 * the only process column
 */
      PANEL->L2    = PANEL->A + ( myrow == icurrow ? JB : 0 );
      PANEL->ldl2  = A->ld;
      PANEL->L1    = (double *)HPL_PTR( PANEL->WORK, dalign );
      PANEL->DPIV  = PANEL->L1    + JB * JB;
      PANEL->DINFO = PANEL->DPIV + JB;       *(PANEL->DINFO) = 0.0;
      PANEL->U     = ( nprow > 1 ? (double*) HPL_PTR( (PANEL->DINFO + 1), dalign ) : NULL );
   }
   else
   {                                        /* space for L2, L1, DPIV */
      ml2 = ( myrow == icurrow ? mp - JB : mp ); ml2 = Mmax( 0, ml2 );
      PANEL->len = ml2*JB + ( itmp1 = JB*JB + JB + 1 );
#ifdef HPL_COPY_L
      lwork = ALGO->align + PANEL->len;
#else
      lwork = ALGO->align + ( mycol == icurcol ? itmp1 : PANEL->len );
#endif
      if( nprow > 1 )                                 /* space for U */
      { 
         nu = ( mycol == icurcol ? nq - JB : nq );
         if (nu % 8) nu += 8 - nu % 8;
         if (nu % 16 == 0) nu += 8;
         lwork += JB * Mmax( 0, nu ) + ALGO->align;
      }

//printf("WORK2 %d of %d\n", (int) lwork, (int) panel_max_lwork);
	  if (lwork > PANEL->memalloc)
	  {
		for (i = 0;i < PANEL_PREALLOC_COUNT;i++)
		{
			if (p_lwork[i] != NULL && lwork <= panel_max_lwork)
			{
				PANEL->WORK = p_lwork[i];
				PANEL->memalloc = panel_max_lwork;
				p_lwork[i] = NULL;
				break;
			}
		}
		if (i == PANEL_PREALLOC_COUNT)
		{
			HPL_pabort( __LINE__, "HPL_pdpanel_init", "Problem with preallocated panel memory");
		}
/*		if (PANEL->WORK)
		{
			 CALDGEMM_free(PANEL->WORK);
			 fprintf(STD_OUT, "WARNING, reallocating Panel memory\n");
		}
    		if( !( PANEL->WORK = (void *) CALDGEMM_alloc( (size_t)(lwork) * sizeof( double ),0 ) ) )
    		{
        	    HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" );
    		}
		PANEL->memalloc = lwork;*/
	  }
/*
 * Initialize the pointers of the panel structure - Re-use A in the cur-
 * rent process column when HPL_COPY_L is not defined.
 */
#ifdef HPL_COPY_L
      PANEL->L2    = (double *)HPL_PTR( PANEL->WORK, dalign );
      PANEL->ldl2  = Mmax( 1, ml2 );
      PANEL->L1    = PANEL->L2 + ml2 * JB;
#else
      if( mycol == icurcol )
      {
         PANEL->L2   = PANEL->A + ( myrow == icurrow ? JB : 0 );
         PANEL->ldl2 = A->ld;
         PANEL->L1   = (double *)HPL_PTR( PANEL->WORK, dalign );
      }
      else
      {
         PANEL->L2   = (double *)HPL_PTR( PANEL->WORK, dalign );
         PANEL->ldl2 = Mmax( 1, ml2 );
         PANEL->L1   = PANEL->L2 + ml2 * JB;
      } 
#endif
      PANEL->DPIV  = PANEL->L1   + JB * JB;
      PANEL->DINFO = PANEL->DPIV + JB;     *(PANEL->DINFO) = 0.0;
      if (nprow > 1) {
          PANEL->U     = (double *)HPL_PTR( (PANEL->DINFO + 1), dalign );
      }
   }
/*
 * If nprow is 1, we just allocate an array of JB integers for the swap.
 * When nprow > 1, we allocate the space for the index arrays immediate-
 * ly. The exact size of this array depends on the swapping routine that
 * will be used, so we allocate the maximum:
 *
 *    IWORK[0] is of size at most 1      +
 *    IPL      is of size at most 1      +
 *    IPID     is of size at most 4 * JB +
 *
 *    For HPL_pdlaswp00:
 *       lindxA   is of size at most 2 * JB +
 *       lindxAU  is of size at most 2 * JB +
 *       llen     is of size at most NPROW  +
 *       llen_sv  is of size at most NPROW.
 *
 *    For HPL_pdlaswp01:
 *       ipA      is of size ar most 1      +
 *       lindxA   is of size at most 2 * JB +
 *       lindxAU  is of size at most 2 * JB +
 *       iplen    is of size at most NPROW  + 1 +
 *       ipmap    is of size at most NPROW  +
 *       ipmapm1  is of size at most NPROW  +
 *       permU    is of size at most JB     +
 *       iwork    is of size at most MAX( 2*JB, NPROW+1 ).
 *
 * that is  3 + 8*JB + MAX(2*NPROW, 3*NPROW+1+JB+MAX(2*JB,NPROW+1))
 *       =  4 + 9*JB + 3*NPROW + MAX( 2*JB, NPROW+1 ).
 *
 * We use the fist entry of this to work array  to indicate  whether the
 * the  local  index arrays have already been computed,  and if yes,  by
 * which function:
 *    IWORK[0] = -1: no index arrays have been computed so far;
 *    IWORK[0] =  0: HPL_pdlaswp00 already computed those arrays;
 *    IWORK[0] =  1: HPL_pdlaswp01 already computed those arrays;
 * This allows to save some redundant and useless computations.
 */
   if( nprow == 1 ) { lwork = JB; }
   else             
   {
      itmp1 = (JB << 1); lwork = nprow + 1; itmp1 = Mmax( itmp1, lwork );
      lwork = 4 + (9 * JB) + (3 * nprow) + itmp1;
   }

//printf("IWORK3 %d of %d\n", (int) lwork, (int) panel_max_ilwork);
   if (lwork > PANEL->memallocI)
   {
     /*if (PANEL->IWORK)
	 {
		 CALDGEMM_free(PANEL->IWORK);
		 fprintf(STD_OUT, "WARNING, reallocating Panel memory\n");
	 }
         PANEL->IWORK = (int *) CALDGEMM_alloc( (size_t)(lwork) * sizeof( int ), 0 );
	 PANEL->memallocI = lwork;*/
	 
		for (i = 0;i < PANEL_PREALLOC_COUNT;i++)
		{
			if (p_ilwork[i] != NULL && lwork <= panel_max_ilwork)
			{
				PANEL->IWORK = (int*) p_ilwork[i];
				PANEL->memallocI = panel_max_ilwork;
				p_ilwork[i] = NULL;
				break;
			}
		}
		if (i == PANEL_PREALLOC_COUNT)
		{
			HPL_pabort( __LINE__, "HPL_pdpanel_init", "Problem with preallocated panel memory");
		}

   }
   if( PANEL->IWORK == NULL )
   { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" ); }
                       /* Initialize the first entry of the workarray */
   *(PANEL->IWORK) = -1;

END_TRACE

/*
 * End of HPL_pdpanel_init
 */
}