void lush_delete(at *p) { if (!p || ZOMBIEP(p)) return; class_t *cl = classof(p); if (cl->dontdelete) error(NIL, "cannot delete this object", p); run_notifiers(p); if (cl->has_compiled_part) { assert(isa(p, object_class)); /* OO objects may have two parts */ /* lush_delete has to delete both of them */ object_t *obj = Mptr(p); struct CClass_object *cobj = obj->cptr; oostruct_dispose(obj); cobj->Vtbl->Cdestroy(cobj); } else { if (Class(p)->dispose) Mptr(p) = Class(p)->dispose(Mptr(p)); else Mptr(p) = NULL; } zombify(p); }
static const char *generic_name(at *p) { if (Class(p)->classname) sprintf(string_buffer, "::%s:%p", NAMEOF(Class(p)->classname),Mptr(p)); else sprintf(string_buffer, "::%p:%p", Class(p), Mptr(p)); return mm_strdup(string_buffer); }
static void storage_serialize(at **pp, int code) { storage_t *st; int type, kind; size_t size; if (code != SRZ_READ) { st = Mptr(*pp); type = (int)st->type; kind = (int)st->kind; size = st->size; } // Read/write basic info serialize_int(&type, code); serialize_int(&kind, code); serialize_size(&size, code); // Create storage if needed if (code == SRZ_READ) { st = new_storage_managed((storage_type_t)type, size, NIL); *pp = st->backptr; } // Read/write storage data st = Mptr(*pp); if (type == ST_AT) { at **data = st->data; for (int i=0; i<size; i++) serialize_atstar( &data[i], code); } else { FILE *f = serialization_file_descriptor(code); if (code == SRZ_WRITE) { extern int in_bwrite; in_bwrite += sizeof(int) + size * storage_sizeof[type]; write4(f, STORAGE_NORMAL); storage_save(st, f); } else if (code == SRZ_READ) { int magic = read4(f); storage_load(st, f); if (magic == STORAGE_SWAPPED) swap_buffer(st->data, size, storage_sizeof[type]); else if (magic != STORAGE_NORMAL) RAISEF("Corrupted binary file",NIL); } } }
static void make_cref_slots(object_t *obj) { assert(obj->cptr); dhclassdoc_t *cdoc = obj->cptr->Vtbl->Cdoc; const class_t *cl = Mptr(cdoc->lispdata.atclass); /* initialize compiled slots */ int k = cl->num_cslots; int j = cl->num_cslots; while (cl) { /* do slots declare in this class */ dhrecord *drec = cl->classdoc->argdata; j -= drec->ndim; assert(j>=0); drec = drec + 1; for (int i=j; i<k; i++) { assert(drec->op == DHT_NAME); void *p = (char *)(obj->cptr)+(ptrdiff_t)(drec->arg); obj->slots[i] = new_cref((drec+1)->op, p); //assign(cl->slots[i], cl->defaults[i]); drec = drec->end; } assert(drec->op == DHT_END_CLASS); cl = cl->super; k = j; } assert(j==0); }
void storage_clear(storage_t *st, at *init, size_t from) { /* don't need to check read-only status here because it will be checked by the setat function below */ int size = st->size; if (from>=size) RAISEF("invalid value for 'from'", NEW_NUMBER(from)); /* clear from from to to */ if (st->type == ST_AT) { for (int off = from; off < size; off++) (storage_setat[st->type])(st, off, init); } else if (storage_setat[st->type] == Number_setat) { get_write_permit(st); void (*set)(gptr, size_t, real) = storage_setd[st->type]; for (int off = from; off < size; off++) set(st->data, off, Number(init)); } else if (storage_setat[st->type] == gptr_setat) { get_write_permit(st); gptr *pt = st->data; for (int off=from; off<size; off++) pt[off] = Gptr(init); } else if (storage_setat[st->type] == mptr_setat) { get_write_permit(st); gptr *pt = st->data; for (int off=from; off<size; off++) pt[off] = Mptr(init); } else RAISEF("don't know how to clear this storage", st->backptr); }
static void mptr_setat(storage_t *st, size_t off, at *x) { get_write_permit(st); ifn (MPTRP(x)) error(NIL, "not an mptr", x); gptr *pt = st->data; pt[off] = Mptr(x); }
/* interface calling into the fortran routine */ static int lbfgs(index_t *x0, at *f, at *g, double gtol, htable_t *p, at *vargs) { /* argument checking and setup */ extern void lbfgs_(int *n, int *m, double *x, double *fval, double *gval, \ int *diagco, double *diag, int iprint[2], double *gtol, \ double *xtol, double *w, int *iflag); ifn (IND_STTYPE(x0) == ST_DOUBLE) error(NIL, "not an array of doubles", x0->backptr); ifn (Class(f)->listeval) error(NIL, "not a function", f); ifn (Class(f)->listeval) error(NIL, "not a function", g); ifn (gtol > 0) error(NIL, "threshold value not positive", NEW_NUMBER(gtol)); at *gx = copy_array(x0)->backptr; at *(*listeval_f)(at *, at *) = Class(f)->listeval; at *(*listeval_g)(at *, at *) = Class(g)->listeval; at *callf = new_cons(f, new_cons(x0->backptr, vargs)); at *callg = new_cons(g, new_cons(gx, new_cons(x0->backptr, vargs))); htable_t *params = lbfgs_params(); if (p) htable_update(params, p); int iprint[2]; iprint[0] = (int)Number(htable_get(params, NEW_SYMBOL("iprint-1"))); iprint[1] = (int)Number(htable_get(params, NEW_SYMBOL("iprint-2"))); lb3_.gtol = Number(htable_get(params, NEW_SYMBOL("ls-gtol"))); lb3_.stpmin = Number(htable_get(params, NEW_SYMBOL("ls-stpmin"))); lb3_.stpmax = Number(htable_get(params, NEW_SYMBOL("ls-stpmax"))); int m = (int)Number(htable_get(params, NEW_SYMBOL("lbfgs-m"))); int n = index_nelems(x0); double *x = IND_ST(x0)->data; double fval; double *gval = IND_ST(Mptr(gx))->data; int diagco = false; double *diag = mm_blob(n*sizeof(double)); double *w = mm_blob((n*(m+m+1)+m+m)*sizeof(double)); double xtol = eps(1); /* machine precision */ int iflag = 0; ifn (n>0) error(NIL, "empty array", x0->backptr); ifn (m>0) error(NIL, "m-parameter must be positive", NEW_NUMBER(m)); /* reverse communication loop */ do { fval = Number(listeval_f(Car(callf), callf)); listeval_g(Car(callg), callg); lbfgs_(&n, &m, x, &fval, gval, &diagco, diag, iprint, >ol, &xtol, w, &iflag); assert(iflag<2); } while (iflag > 0); return iflag; }
void cg_grad_adaptor(double *g, double *x, int n) { static at *call = NIL; static int nx = -1; static storage_t *stx = NULL; static storage_t *stg = NULL; static at *(*listeval)(at *, at *) = NULL; if (n == -1) { /* initialize */ at *x0 = var_get(named("x0")); at *vargs = var_get(named("vargs")); at *g = var_get(named("g")); ifn (x0) error(NIL, "x0 not found", NIL); ifn (INDEXP(x0) && IND_STTYPE((index_t *)Mptr(x0))) error(NIL, "x0 not a double index", x0); ifn (g) error(NIL, "g not found", NIL); listeval = Class(g)->listeval; index_t *ind = Mptr(x0); nx = storage_nelems(IND_ST(ind)); stx = new_storage(ST_DOUBLE); stx->flags = STS_FOREIGN; stx->size = nx; stx->data = (char *)-1; stg = new_storage(ST_DOUBLE); stg->flags = STS_FOREIGN; stg->size = nx; stg->data = (char *)-1; call = new_cons(g, new_cons(NEW_INDEX(stg, IND_SHAPE(ind)), new_cons(NEW_INDEX(stx, IND_SHAPE(ind)), vargs))); } else { if (n != nx) error(NIL, "vector of different size expected", NEW_NUMBER(n)); stx->data = x; stg->data = g; listeval(Car(call), call); } }
static unsigned long oostruct_hash(at *p) { unsigned long x = 0x87654321; object_t *obj = Mptr(p); class_t *cl = Class(p); x ^= hash_pointer((void*)cl); for(int i=0; i<cl->num_slots; i++) { x = (x<<1) | ((long)x<0 ? 1 : 0); x ^= hash_value(obj->slots[i]); } return x; }
object_t *object_from_cobject(struct CClass_object *cobj) { ifn (cobj->Vtbl && cobj->Vtbl->Cdoc && cobj->Vtbl->Cdoc->lispdata.atclass) error(NIL, "attempt to access instance of unlinked class", NIL); assert(cobj->Vtbl); object_t *obj = cobj->__lptr; ifn (obj) { class_t *cl = Mptr(cobj->Vtbl->Cdoc->lispdata.atclass); obj = _new_object(cl, cobj); } return obj; }
static const char *class_name(at *p) { class_t *cl = Mptr(p); if (cl->classname) sprintf(string_buffer, "::class:%s", NAMEOF(cl->classname)); else sprintf(string_buffer, "::class:%p", cl); if (!cl->live) strcat(string_buffer, " (unlinked)"); return mm_strdup(string_buffer); }
double cg_value_adaptor(double *x, int n) { static at *call = NIL; static int nx = -1; static storage_t *st = NULL; static at *(*listeval)(at *, at *) = NULL; if (n == -1) { /* initialize */ at *x0 = var_get(named("x0")); at *vargs = var_get(named("vargs")); at *f = var_get(named("f")); ifn (x0) error(NIL, "x0 not found", NIL); ifn (INDEXP(x0) && IND_STTYPE((index_t *)Mptr(x0))) error(NIL, "x0 not a double index", x0); ifn (f) error(NIL, "f not found", NIL); listeval = Class(f)->listeval; index_t *ind = Mptr(x0); nx = storage_nelems(IND_ST(ind)); st = new_storage(ST_DOUBLE); st->flags = STS_FOREIGN; st->size = nx; st->data = (char *)-1; call = new_cons(f, new_cons(NEW_INDEX(st, IND_SHAPE(ind)), vargs)); return NAN; } else { if (n != nx) error(NIL, "vector of different size expected", NEW_NUMBER(n)); st->data = x; return Number(listeval(Car(call), call)); } }
static int oostruct_compare(at *p, at *q, int order) { if (order) error(NIL, "cannot rank objects", NIL); object_t *obj1 = Mptr(p); object_t *obj2 = Mptr(q); class_t *cl1 = Class(obj1->backptr); class_t *cl2 = Class(obj2->backptr); if (cl1->num_slots != cl2->num_slots) return 1; /* check that slots names match before comparing values */ for (int i=0; i<cl1->num_slots; i++) if (cl1->slots[i] != cl2->slots[i]) return 1; for(int i=0; i<cl1->num_slots; i++) { if (!eq_test(obj1->slots[i], obj2->slots[i])) return 1; } return 0; }
static const char *storage_name(at *p) { storage_t *st = (storage_t *)Mptr(p); char *kind = ""; switch (st->kind) { case STS_NULL: kind = "unallocated"; break; case STS_MANAGED: kind = "managed"; break; case STS_FOREIGN: kind = "foreign"; break; case STS_MMAP: kind = "mmap"; break; case STS_STATIC: kind = "static"; break; default: fprintf(stderr, "internal error: invalid storage kind"); abort(); } const char *clname = nameof(Symbol(Class(p)->classname)); sprintf(string_buffer, "::%s:%s@%p:<%"PRIdPTR">", clname, kind, st->data, st->size); return mm_strdup(string_buffer); }
at *oostruct_getslot(at *p, at *prop) { at *slot = Car(prop); ifn (SYMBOLP(slot)) error(NIL,"not a slot name", slot); prop = Cdr(prop); object_t *obj = Mptr(p); class_t *cl = Class(obj->backptr); for (int i=0; i<cl->num_slots; i++) if (slot == cl->slots[i]) { at *sloti = obj->slots[i]; if (i<cl->num_cslots) sloti = eval(sloti); if (prop) return getslot(sloti, prop); else return sloti; } error(NIL, "not a slot", slot); }
void oostruct_setslot(at *p, at *prop, at *val) { at *slot = Car(prop); ifn (SYMBOLP(slot)) error(NIL, "not a slot name", slot); prop = Cdr(prop); object_t *obj = Mptr(p); class_t *cl = Class(obj->backptr); for(int i=0; i<cl->num_slots; i++) if (slot == cl->slots[i]) { if (prop) setslot(&obj->slots[i], prop, val); else if (i<cl->num_cslots) { class_t *cl = classof(obj->slots[i]); cl->setslot(obj->slots[i], NIL, val); } else obj->slots[i] = val; return; } error(NIL, "not a slot", slot); }
static at *storage_listeval(at *p, at *q) { storage_t *st = Mptr(p); if (!st->data) error(NIL, "unsized storage", p); q = eval_arglist(Cdr(q)); ifn (CONSP(q) && Car(q) && NUMBERP(Car(q))) error(NIL, "illegal subscript", q); ssize_t off = Number(Car(q)); if (off<0 || off>=st->size) error(NIL, "subscript out of range", q); if (Cdr(q)) { ifn (CONSP(Cdr(q)) && !Cddr(q)) error(NIL, "one or two arguments expected",q); storage_setat[st->type](st, off, Cadr(q)); return st->backptr; } else return storage_getat[st->type](st, off); }
at *with_object(at *p, at *f, at *q, int howfar) { assert(howfar>=0); MM_ENTER; at *ans = NIL; if (OBJECTP(p)) { object_t *obj = Mptr(p); class_t *cl = Class(obj->backptr); if (howfar > cl->num_slots) howfar = cl->num_slots; /* push object environment */ for (int i = 0; i<howfar; i++) { Symbol(cl->slots[i]) = symbol_push(Symbol(cl->slots[i]), 0 , &(obj->slots[i])); if (i < cl->num_cslots) MARKVAR_SYMBOL(Symbol(cl->slots[i])); } SYMBOL_PUSH(at_this, p); LOCK_SYMBOL(Symbol(at_this)); ans = apply(f, q); /* pop object environment */ SYMBOL_POP(at_this); for (int i = 0; i<howfar; i++) SYMBOL_POP(cl->slots[i]); } else { if (p == NIL) printf("*** Warning\007 (with-object () ...)\n"); SYMBOL_PUSH(at_this, p); ans = apply(f, q); SYMBOL_POP(at_this); } MM_RETURN(ans); }
void HPL_pdrpanllT ( HPL_T_panel * PANEL, const int M, const int N, const int ICOFF, double * WORK ) { /* * Purpose * ======= * * HPL_pdrpanllT recursively factorizes a panel of columns using the * recursive Left-looking variant of the one-dimensional algorithm. The * lower triangular N0-by-N0 upper block of the panel is stored in * transpose form. * * Bi-directional exchange is used to perform the swap::broadcast * operations at once for one column in the panel. This results in a * lower number of slightly larger messages than usual. On P processes * and assuming bi-directional links, the running time of this function * can be approximated by (when N is equal to N0): * * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + * N0^2 * ( M - N0/3 ) * gam2-3 * * where M is the local number of rows of the panel, lat and bdwth are * the latency and bandwidth of the network for double precision real * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS * rate of execution. The recursive algorithm allows indeed to almost * achieve Level 3 BLAS performance in the panel factorization. On a * large number of modern machines, this operation is however latency * bound, meaning that its cost can be estimated by only the latency * portion N0 * log_2(P) * lat. Mono-directional links will double this * communication cost. * * Arguments * ========= * * PANEL (local input/output) HPL_T_panel * * On entry, PANEL points to the data structure containing the * panel information. * * M (local input) const int * On entry, M specifies the local number of rows of sub(A). * * N (local input) const int * On entry, N specifies the local number of columns of sub(A). * * ICOFF (global input) const int * On entry, ICOFF specifies the row and column offset of sub(A) * in A. * * WORK (local workspace) double * * On entry, WORK is a workarray of size at least 2*(4+2*N0). * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ double * A, * Aptr, * L1, * L1ptr; int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; /* .. * .. Executable Statements .. */ if( N <= ( nbmin = PANEL->algo->nbmin ) ) { PANEL->algo->pffun( PANEL, M, N, ICOFF, WORK ); return; } /* * Find new recursive blocking factor. To avoid an infinite loop, one * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. * First, we compute nblocks: the number of blocks of size NBMIN in N, * including the last one that may be smaller. nblocks is thus larger * than or equal to one, since N >= NBMIN. * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal * to one as well. For NDIV >= 2, we are guaranteed that the quan- * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and * greater than or equal to NBMIN. */ nbdiv = PANEL->algo->nbdiv; ii = jj = 0; m = M; n = N; nb = jb = ( (((N+nbmin-1) / nbmin) + nbdiv - 1) / nbdiv ) * nbmin; A = PANEL->A; lda = PANEL->lda; L1 = PANEL->L1; n0 = PANEL->jb; L1ptr = Mptr( L1, ICOFF, ICOFF, n0 ); curr = (int)( PANEL->grid->myrow == PANEL->prow ); if( curr != 0 ) Aptr = Mptr( A, ICOFF, ICOFF, lda ); else Aptr = Mptr( A, 0, ICOFF, lda ); /* * The triangular solve is replicated in every process row. The panel * factorization is such that the first rows of A are accumulated in * every process row during the (panel) swapping phase. We ensure this * way a minimum amount of communication during the entire panel facto- * rization. */ do { n -= jb; ioff = ICOFF + jj; /* * Replicated solve - Local update - Factor current panel */ HPL_dtrsm( HplColumnMajor, HplRight, HplUpper, HplNoTrans, HplUnit, jb, jj, HPL_rone, L1ptr, n0, Mptr( L1ptr, jj, 0, n0 ), n0 ); HPL_dgemm( HplColumnMajor, HplNoTrans, HplTrans, m, jb, jj, -HPL_rone, Mptr( Aptr, ii, 0, lda ), lda, Mptr( L1ptr, jj, 0, n0 ), n0, HPL_rone, Mptr( Aptr, ii, jj, lda ), lda ); HPL_pdrpanllT( PANEL, m, jb, ioff, WORK ); /* * Copy back upper part of A in current process row - Go the next block */ if( curr != 0 ) { HPL_dlatcpy( ioff, jb, Mptr( L1, ioff, 0, n0 ), n0, Mptr( A, 0, ioff, lda ), lda ); ii += jb; m -= jb; } jj += jb; jb = Mmin( n, nb ); } while( n > 0 ); /* * End of HPL_pdrpanllT */ }
void HPL_pdtrsv(HPL_T_grid* GRID, HPL_T_pmat* AMAT) { /* * Purpose * ======= * * HPL_pdtrsv solves an upper triangular system of linear equations. * * The rhs is the last column of the N by N+1 matrix A. The solve starts * in the process column owning the Nth column of A, so the rhs b may * need to be moved one process column to the left at the beginning. The * routine therefore needs a column vector in every process column but * the one owning b. The result is replicated in all process rows, and * returned in XR, i.e. XR is of size nq = LOCq( N ) in all processes. * * The algorithm uses decreasing one-ring broadcast in process rows and * columns implemented in terms of synchronous communication point to * point primitives. The lookahead of depth 1 is used to minimize the * critical path. This entire operation is essentially ``latency'' bound * and an estimate of its running time is given by: * * (move rhs) lat + N / ( P bdwth ) + * (solve) ((N / NB)-1) 2 (lat + NB / bdwth) + * gam2 N^2 / ( P Q ), * * where gam2 is an estimate of the Level 2 BLAS rate of execution. * There are N / NB diagonal blocks. One must exchange 2 messages of * length NB to compute the next NB entries of the vector solution, as * well as performing a total of N^2 floating point operations. * * Arguments * ========= * * GRID (local input) HPL_T_grid * * On entry, GRID points to the data structure containing the * process grid information. * * AMAT (local input/output) HPL_T_pmat * * On entry, AMAT points to the data structure containing the * local array information. * * --------------------------------------------------------------------- */ //Local Variables MPI_Comm Ccomm, Rcomm; double *A=NULL, *Aprev=NULL, *Aptr, *XC=NULL, *XR=NULL, *Xd=NULL, *Xdprev=NULL, *W=NULL; int Alcol_matrix, Alcol_process, Alrow, Anpprev, Anp, Anq, Bcol, Cmsgid, GridIsNotPx1, GridIsNot1xQ, Rmsgid, colprev, kb, kbprev, lda, mycol, myrow, n, n1, n1p, n1pprev=0, nb, npcol, nprow, rowprev, tmp1, tmp2, Wsize; int sendcol_matrix = -1; //Executable Statements HPL_ptimer_detail( HPL_TIMING_PTRSV ); if ((n = AMAT->n) <= 0) return; nb = AMAT->nb; lda = AMAT->ld; A = AMAT->A; XR = AMAT->X; (void) HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); //if (mycol >= 2) return; //npcol = 2; Rcomm = GRID->row_comm; Rmsgid = MSGID_BEGIN_PTRSV; Ccomm = GRID->col_comm; Cmsgid = MSGID_BEGIN_PTRSV + 1; GridIsNot1xQ = (nprow > 1); GridIsNotPx1 = (npcol > 1); //Move the rhs in the process column owning the last column of A. Mnumrow(Anp, n, nb, myrow, nprow); Mnumcol(Anq, n, nb, mycol, GRID); tmp1 = (n - 1) / nb; Alrow = tmp1 % nprow; Alcol_matrix = tmp1; Alcol_process = MColBlockToPCol(Alcol_matrix, GRID); kb = n - tmp1 * nb; Aptr = (double *) (A); XC = Mptr(Aptr, 0, Anq, lda); Mindxg2p_col(n, nb, nb, Bcol, 0, GRID); if ((Anp > 0) && (Alcol_process != Bcol)) { if(mycol == Bcol) { (void) HPL_send(XC, Anp, Alcol_process, Rmsgid, Rcomm); } else if(mycol == Alcol_process) { (void) HPL_recv(XC, Anp, Bcol, Rmsgid, Rcomm); } } Rmsgid = (Rmsgid + 2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid + 2); if (mycol != Alcol_process) { for(tmp1 = 0; tmp1 < Anp; tmp1++) { XC[tmp1] = HPL_rzero; } } //Set up lookahead //n1 = (npcol - 1) * nb; //n1 = Mmax(n1, nb); n1 = HPL_n1(Alcol_matrix, nb, GRID); Wsize = Mmin((npcol - 1) * nb, Anp); if (Wsize > 0) { W = (double*) malloc((size_t) Wsize * sizeof(double)); if (W == NULL) { HPL_pabort(__LINE__, "HPL_pdtrsv", "Memory allocation failed"); } } Anpprev = Anp; Xdprev = XR; Aprev = Aptr = Mptr(Aptr, 0, Anq, lda); tmp1 = n - kb; tmp1 -= (tmp2 = Mmin(tmp1, n1)); MnumrowI(n1pprev, tmp2, Mmax(0, tmp1), nb, myrow, nprow); if (myrow == Alrow) { Anpprev = (Anp -= kb); } if (mycol == Alcol_process) { Aprev = (Aptr -= lda * kb); Anq -= kb; Xdprev = (Xd = XR + Anq); if (myrow == Alrow) { HPL_dtrsv(HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, kb, Aptr+Anp, lda, XC+Anp, 1); //fprintfqt(STD_OUT, "Process %d: dtrsv, offset %d\n", GRID->iam, Anp); HPL_dcopy(kb, XC+Anp, 1, Xd, 1); } } n -= kb; // Start the operations while(n > 0) { rowprev = Alrow; Alrow = MModSub1(Alrow, nprow); colprev = Alcol_process; Alcol_matrix--; Alcol_process = MColBlockToPCol(Alcol_matrix, GRID); kbprev = kb; n1 = HPL_n1(Alcol_matrix, nb, GRID); tmp1 = n - (kb = nb); tmp1 -= (tmp2 = Mmin(tmp1, n1)); MnumrowI(n1p, tmp2, Mmax(0, tmp1), nb, myrow, nprow); if(mycol == Alcol_process) { Aptr -= lda * kb; Anq -= kb; Xd = XR + Anq; } if(myrow == Alrow) { Anp -= kb; } /* * Broadcast (decreasing-ring) of previous solution block in previous * process column, compute partial update of current block and send it * to current process column. */ if (mycol == colprev) { //Send previous solution block in process row above if (myrow == rowprev) { if (GridIsNot1xQ) { (void) HPL_send(Xdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm); } } else { (void) HPL_recv(Xdprev, kbprev, MModAdd1(myrow, nprow), Cmsgid, Ccomm); } } if (Alcol_process < colprev ? (mycol <= colprev && mycol > Alcol_process) : (mycol <= colprev || mycol > Alcol_process)) { if (mycol == colprev) { //Compute partial update of previous solution block and send it to current column tmp1 = Anpprev - n1pprev; HPL_dgemv(HplColumnMajor, HplNoTrans, n1pprev, kbprev, -HPL_rone, Aprev+tmp1, lda, Xdprev, 1, HPL_rone, XC+tmp1, 1 ); //fprintfqt(STD_OUT, "Process %d: dgemv %d rows starting from %d\n", GRID->iam, n1pprev, tmp1); sendcol_matrix = Alcol_matrix; } if(GridIsNotPx1) { if (sendcol_matrix != -1) { tmp2 = 1; while (sendcol_matrix >= tmp2 && !(GRID->col_mapping[sendcol_matrix - tmp2 + 1] > GRID->col_mapping[sendcol_matrix - tmp2] ? (GRID->col_mapping[sendcol_matrix - tmp2 + 1] >= mycol && GRID->col_mapping[sendcol_matrix - tmp2] <= mycol) : (GRID->col_mapping[sendcol_matrix - tmp2 + 1] >= mycol || GRID->col_mapping[sendcol_matrix - tmp2] <= mycol))) { tmp2++; } sendcol_matrix -= tmp2; tmp2 *= nb; MnumrowI(tmp1, tmp2, n - tmp2, nb, myrow, nprow); tmp2 = Anpprev - tmp1; } else { tmp2 = 0; tmp1 = 0; } //fprintfqt(STD_OUT, "Process %d: sending to %d (%d bytes starting from %d, partial update)\n", GRID->iam, Alcol_process, tmp1, tmp2); (void) MPI_Send(XC+tmp2, tmp1, MPI_DOUBLE, Alcol_process, Rmsgid, Rcomm); } } if (mycol == colprev) { //Finish the (decreasing-ring) broadcast of the solution block in previous process column if((myrow != rowprev) && (myrow != MModAdd1(rowprev, nprow))) { //(void) HPL_send(Xdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm); MPI_Send(Xdprev, kbprev, MPI_DOUBLE, MModSub1(myrow, nprow), Cmsgid, Ccomm); } } else if (mycol == Alcol_process) { //Current column receives and accumulates partial update of previous solution block for (int i = colprev;(i - mycol) % npcol != 0;i = (i + npcol - 1) % npcol) { MPI_Status tmpstatus; int recvsize; //fprintfqt(STD_OUT, "Process %d starting receive from %d (buffer %d)\n", GRID->iam, i, Wsize); MPI_Recv(W, Wsize, MPI_DOUBLE, i, Rmsgid, Rcomm, &tmpstatus); MPI_Get_count(&tmpstatus, MPI_DOUBLE, &recvsize); //fprintfqt(STD_OUT, "Process %d: received from %d (%d bytes starting from %d)\n", GRID->iam, i, recvsize, Anpprev - recvsize); HPL_daxpy(recvsize, HPL_rone, W, 1, XC+Anpprev-recvsize, 1); } } //Solve current diagonal block if((mycol == Alcol_process) && (myrow == Alrow)) { HPL_dtrsv(HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, kb, Aptr+Anp, lda, XC+Anp, 1); //fprintfqt(STD_OUT, "Process %d: dtrsv, offset %d\n", GRID->iam, Anp); HPL_dcopy(kb, XC+Anp, 1, XR+Anq, 1); } //Finish previous update if((mycol == colprev) && ((tmp1 = Anpprev - n1pprev ) > 0)) { HPL_dgemv(HplColumnMajor, HplNoTrans, tmp1, kbprev, -HPL_rone, Aprev, lda, Xdprev, 1, HPL_rone, XC, 1); //fprintfqt(STD_OUT, "Process %d: dgemv (%d rows starting from %d, finishing)\n", GRID->iam, tmp1, 0); } //Save info of current step and update info for the next step if (mycol == Alcol_process) { Xdprev = Xd; Aprev = Aptr; } if (myrow == Alrow) { Anpprev -= kb; } n1pprev = n1p; n -= kb; Rmsgid = (Rmsgid+2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid+2); Cmsgid = (Cmsgid+2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV+1 : Cmsgid+2); } rowprev = Alrow; colprev = Alcol_process; kbprev = kb; //Replicate last solution block if (mycol == colprev) { (void) HPL_broadcast((void *) (XR), kbprev, HPL_DOUBLE, rowprev, Ccomm); } if (Wsize) free(W); HPL_ptimer_detail(HPL_TIMING_PTRSV); //End of HPL_pdtrsv }
void HPL_pdpanrlT ( HPL_T_panel * PANEL, const int M, const int N, const int ICOFF, double * WORK ) { /* * Purpose * ======= * * HPL_pdpanrlT factorizes a panel of columns that is a sub-array of a * larger one-dimensional panel A using the Right-looking variant of the * usual one-dimensional algorithm. The lower triangular N0-by-N0 upper * block of the panel is stored in transpose form. * * Bi-directional exchange is used to perform the swap::broadcast * operations at once for one column in the panel. This results in a * lower number of slightly larger messages than usual. On P processes * and assuming bi-directional links, the running time of this function * can be approximated by (when N is equal to N0): * * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + * N0^2 * ( M - N0/3 ) * gam2-3 * * where M is the local number of rows of the panel, lat and bdwth are * the latency and bandwidth of the network for double precision real * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS * rate of execution. The recursive algorithm allows indeed to almost * achieve Level 3 BLAS performance in the panel factorization. On a * large number of modern machines, this operation is however latency * bound, meaning that its cost can be estimated by only the latency * portion N0 * log_2(P) * lat. Mono-directional links will double this * communication cost. * * Note that one iteration of the the main loop is unrolled. The local * computation of the absolute value max of the next column is performed * just after its update by the current column. This allows to bring the * current column only once through cache at each step. The current * implementation does not perform any blocking for this sequence of * BLAS operations, however the design allows for plugging in an optimal * (machine-specific) specialized BLAS-like kernel. This idea has been * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. * * Arguments * ========= * * PANEL (local input/output) HPL_T_panel * * On entry, PANEL points to the data structure containing the * panel information. * * M (local input) const int * On entry, M specifies the local number of rows of sub(A). * * N (local input) const int * On entry, N specifies the local number of columns of sub(A). * * ICOFF (global input) const int * On entry, ICOFF specifies the row and column offset of sub(A) * in A. * * WORK (local workspace) double * * On entry, WORK is a workarray of size at least 2*(4+2*N0). * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ double * A, * Acur, * Anxt, * L1; int Mm1, Nm1, curr, ii, iip1, jj, lda, m=M, n0; /* .. * .. Executable Statements .. */ A = PANEL->A; lda = PANEL->lda; L1 = PANEL->L1; n0 = PANEL->jb; curr = (int)( PANEL->grid->myrow == PANEL->prow ); Nm1 = N - 1; jj = ICOFF; if( curr != 0 ) { ii = ICOFF; iip1 = ii+1; Mm1 = m-1; } else { ii = 0; iip1 = ii; Mm1 = m; } /* * Find local absolute value max in first column - initialize WORK[0:3] */ HPL_dlocmax( PANEL, m, ii, jj, WORK ); while( Nm1 >= 1 ) { Acur = Mptr( A, iip1, jj, lda ); Anxt = Mptr( Acur, 0, 1, lda ); /* * Swap and broadcast the current row */ HPL_pdmxswp( PANEL, m, ii, jj, WORK ); HPL_dlocswpT( PANEL, ii, jj, WORK ); /* * Scale current column by its absolute value max entry - Update trai- * ling sub-matrix and find local absolute value max in next column (On- * ly one pass through cache for each current column). This sequence of * operations could benefit from a specialized blocked implementation. */ if( WORK[0] != HPL_rzero ) HPL_dscal( Mm1, HPL_rone / WORK[0], Acur, 1 ); HPL_daxpy( Mm1, -(*(Mptr( L1, jj+1, jj, n0 ))), Acur, 1, Anxt, 1 ); HPL_dlocmax( PANEL, Mm1, iip1, jj+1, WORK ); if( Nm1 > 1 ) { HPL_dger( HplColumnMajor, Mm1, Nm1-1, -HPL_rone, Acur, 1, Mptr( L1, jj+2, jj, n0 ), 1, Mptr( Anxt, 0, 1, lda ), lda ); } if( curr != 0 ) { ii = iip1; iip1++; m = Mm1; Mm1--; } Nm1--; jj++; } /* * Swap and broadcast last row - Scale last column by its absolute value * max entry */ HPL_pdmxswp( PANEL, m, ii, jj, WORK ); HPL_dlocswpT( PANEL, ii, jj, WORK ); if( WORK[0] != HPL_rzero ) HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 ); /* * End of HPL_pdpanrlT */ }
class_t *new_ooclass(at *classname, at *atsuper, at *new_slots, at *defaults) { ifn (SYMBOLP(classname)) error(NIL, "not a valid class name", classname); ifn (length(new_slots) == length(defaults)) error(NIL, "number of slots must match number of defaults", NIL); ifn (CLASSP(atsuper)) error(NIL, "not a class", atsuper); class_t *super = Mptr(atsuper); if (builtin_class_p(super)) error(NIL, "superclass not descendend of class <object>", atsuper); at *p = new_slots; at *q = defaults; int n = 0; while (CONSP(p) && CONSP(q)) { ifn (SYMBOLP(Car(p))) error(NIL, "not a symbol", Car(p)); for (int i=0; i<super->num_slots; i++) if (Car(p) == super->slots[i]) error(NIL, "a superclass has slot of this name", Car(p)); p = Cdr(p); q = Cdr(q); n++; } assert(!p && !q); /* builds the new class */ class_t *cl = mm_alloc(mt_class); *cl = *object_class; if (new_slots) { int num_slots = n + super->num_slots; cl->slots = mm_allocv(mt_refs, num_slots*sizeof(at *)); cl->defaults = mm_allocv(mt_refs, num_slots*sizeof(at *)); int i = 0; for (; i<super->num_slots; i++) { cl->slots[i] = super->slots[i]; cl->defaults[i] = super->defaults[i]; } for (at *s = new_slots, *d = defaults; CONSP(s); i++, s = Cdr(s), d = Cdr(d)) { cl->slots[i] = Car(s); cl->defaults[i] = Car(d); } assert(i == num_slots); cl->num_slots = num_slots; } else { cl->slots = super->slots; cl->defaults = super->defaults; cl->num_slots = super->num_slots; } /* Sets up classname */ cl->classname = classname; cl->priminame = classname; /* Sets up lists */ cl->super = super; cl->atsuper = atsuper; cl->nextclass = super->subclasses; cl->subclasses = 0L; super->subclasses = cl; /* Initialize the slots */ cl->myslots = new_slots; cl->mydefaults = defaults; /* Initialize the methods and hash table */ cl->methods = NIL; cl->hashtable = 0L; cl->hashsize = 0; cl->hashok = 0; /* Initialize DHCLASS stuff */ cl->has_compiled_part = super->has_compiled_part; cl->num_cslots = super->num_cslots; cl->classdoc = 0; cl->kname = 0; /* Create AT and returns it */ assert(class_class); cl->backptr = new_at(class_class, cl); return cl; }
double * A1, * A2, * L, * Wr0, * Wmx; int ilindx, lda, myrow, n0, nr, nu; register int i; /* .. * .. Executable Statements .. */ myrow = PANEL->grid->myrow; n0 = PANEL->jb; lda = PANEL->lda; Wr0 = ( Wmx = WORK + 4 ) + n0; Wmx[JJ] = gmax = WORK[0]; nu = (int)( ( (unsigned int)(n0) >> HPL_LOCSWP_LOG2_DEPTH ) << HPL_LOCSWP_LOG2_DEPTH ); nr = n0 - nu; /* * Replicated swap and copy of the current (new) row of A into L1 */ L = Mptr( PANEL->L1, 0, JJ, n0 ); /* * If the pivot is non-zero ... */ if( gmax != HPL_rzero ) { /* * and if I own the current row of A ... */ if( myrow == PANEL->prow ) { /* * and if I also own the row to be swapped with the current row of A ... */ if( myrow == (int)(WORK[3]) ) {
void HPL_pdfact ( HPL_T_panel * PANEL ) { /* * Purpose * ======= * * HPL_pdfact recursively factorizes a 1-dimensional panel of columns. * The RPFACT function pointer specifies the recursive algorithm to be * used, either Crout, Left- or Right looking. NBMIN allows to vary the * recursive stopping criterium in terms of the number of columns in the * panel, and NDIV allows to specify the number of subpanels each panel * should be divided into. Usuallly a value of 2 will be chosen. Finally * PFACT is a function pointer specifying the non-recursive algorithm to * to be used on at most NBMIN columns. One can also choose here between * Crout, Left- or Right looking. Empirical tests seem to indicate that * values of 4 or 8 for NBMIN give the best results. * * Bi-directional exchange is used to perform the swap::broadcast * operations at once for one column in the panel. This results in a * lower number of slightly larger messages than usual. On P processes * and assuming bi-directional links, the running time of this function * can be approximated by (when N is equal to N0): * * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + * N0^2 * ( M - N0/3 ) * gam2-3 * * where M is the local number of rows of the panel, lat and bdwth are * the latency and bandwidth of the network for double precision real * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS * rate of execution. The recursive algorithm allows indeed to almost * achieve Level 3 BLAS performance in the panel factorization. On a * large number of modern machines, this operation is however latency * bound, meaning that its cost can be estimated by only the latency * portion N0 * log_2(P) * lat. Mono-directional links will double this * communication cost. * * Arguments * ========= * * PANEL (local input/output) HPL_T_panel * * On entry, PANEL points to the data structure containing the * panel information. * * --------------------------------------------------------------------- */ /* * .. Local Variables .. */ void * vptr = NULL; int align, jb; /* .. * .. Executable Statements .. */ jb = PANEL->jb; PANEL->n -= jb; PANEL->ja += jb; if( ( PANEL->grid->mycol != PANEL->pcol ) || ( jb <= 0 ) ) return; HPL_ptimer_detail( HPL_TIMING_RPFACT ); VT_USER_START_A("Factorization"); align = PANEL->algo->align; vptr = (void *)malloc( ( (size_t)(align) + (size_t)(((4+((unsigned int)(jb) << 1)) << 1) )) * sizeof(double) ); if( vptr == NULL ) { HPL_pabort( __LINE__, "HPL_pdfact", "Memory allocation failed" ); } /* * Factor the panel - Update the panel pointers */ PANEL->algo->rffun( PANEL, PANEL->mp, jb, 0, (double *)HPL_PTR( vptr, ((size_t)(align) * sizeof(double) ) ) ); if( vptr ) free( vptr ); PANEL->A = Mptr( PANEL->A, 0, jb, PANEL->lda ); PANEL->nq -= jb; PANEL->jj += jb; VT_USER_END_A("Factorization"); HPL_ptimer_detail( HPL_TIMING_RPFACT ); /* * End of HPL_pdfact */ }
void HPL_pdpanel_init ( HPL_T_grid * GRID, HPL_T_palg * ALGO, const int M, const int N, const int JB, const int NB, HPL_T_pmat * A, const int IA, const int JA, const int TAG, HPL_T_panel * PANEL ) { /* * Purpose * ======= * * HPL_pdpanel_init initializes a panel data structure. * * * Arguments * ========= * * GRID (local input) HPL_T_grid * * On entry, GRID points to the data structure containing the * process grid information. * * ALGO (global input) HPL_T_palg * * On entry, ALGO points to the data structure containing the * algorithmic parameters. * * M (local input) const int * On entry, M specifies the global number of rows of the panel. * M must be at least zero. * * N (local input) const int * On entry, N specifies the global number of columns of the * panel and trailing submatrix. N must be at least zero. * * JB (global input) const int * On entry, JB specifies is the number of columns of the panel. * JB must be at least zero. * * A (local input/output) HPL_T_pmat * * On entry, A points to the data structure containing the local * array information. * * IA (global input) const int * On entry, IA is the global row index identifying the panel * and trailing submatrix. IA must be at least zero. * * JA (global input) const int * On entry, JA is the global column index identifying the panel * and trailing submatrix. JA must be at least zero. * * TAG (global input) const int * On entry, TAG is the row broadcast message id. * * PANEL (local input/output) HPL_T_panel * * On entry, PANEL points to the data structure containing the * panel information. * * --------------------------------------------------------------------- */ START_TRACE( PDPANEL_INIT ) /* * .. Local Variables .. */ size_t dalign; int icurcol, icurrow, ii, itmp1, jj, lwork, ml2, mp, mycol, myrow, npcol, nprow, nq, nu; int i; /* .. * .. Executable Statements .. */ PANEL->grid = GRID; /* ptr to the process grid */ PANEL->algo = ALGO; /* ptr to the algo parameters */ PANEL->pmat = A; /* ptr to the local array info */ myrow = GRID->myrow; mycol = GRID->mycol; nprow = GRID->nprow; npcol = GRID->npcol; HPL_infog2l( IA, JA, NB, NB, 0, 0, myrow, mycol, nprow, npcol, &ii, &jj, &icurrow, &icurcol, GRID ); mp = HPL_numrowI( M, IA, NB, myrow, nprow ); nq = HPL_numcolI( N, JA, NB, mycol, GRID ); /* ptr to trailing part of A */ PANEL->A = Mptr( (double *)(A->A), ii, jj, A->ld ); /* * Workspace pointers are initialized to NULL. */ PANEL->L2 = NULL; PANEL->L1 = NULL; PANEL->DPIV = NULL; PANEL->DINFO = NULL; PANEL->U = NULL; /* * Local lengths, indexes process coordinates */ PANEL->nb = NB; /* distribution blocking factor */ PANEL->jb = JB; /* panel width */ PANEL->m = M; /* global # of rows of trailing part of A */ PANEL->n = N; /* global # of cols of trailing part of A */ PANEL->ia = IA; /* global row index of trailing part of A */ PANEL->ja = JA; /* global col index of trailing part of A */ PANEL->mp = mp; /* local # of rows of trailing part of A */ PANEL->nq = nq; /* local # of cols of trailing part of A */ PANEL->ii = ii; /* local row index of trailing part of A */ PANEL->jj = jj; /* local col index of trailing part of A */ PANEL->lda = A->ld; /* local leading dim of array A */ PANEL->prow = icurrow; /* proc row owning 1st row of trailing A */ PANEL->pcol = icurcol; /* proc col owning 1st col of trailing A */ PANEL->msgid = TAG; /* message id to be used for panel bcast */ /* * Initialize ldl2 and len to temporary dummy values and Update tag for * next panel */ PANEL->ldl2 = 0; /* local leading dim of array L2 */ PANEL->len = 0; /* length of the buffer to broadcast */ /* * Figure out the exact amount of workspace needed by the factorization * and the update - Allocate that space - Finish the panel data structu- * re initialization. * * L1: JB x JB in all processes * DPIV: JB in all processes * DINFO: 1 in all processes * * We make sure that those three arrays are contiguous in memory for the * later panel broadcast. We also choose to put this amount of space * right after L2 (when it exist) so that one can receive a contiguous * buffer. */ dalign = ALGO->align * sizeof( double ); if( npcol == 1 ) /* P x 1 process grid */ { /* space for L1, DPIV, DINFO */ lwork = ALGO->align + ( PANEL->len = JB * JB + JB + 1 ); if( nprow > 1 ) /* space for U */ { nu = nq - JB; if (nu % 8) nu += 8 - nu % 8; if (nu % 16 == 0) nu += 8; lwork += JB * Mmax( 0, nu ) + ALGO->align; } //printf("WORK1 %d of %d\n", (int) lwork, (int) panel_max_lwork); if (lwork > PANEL->memalloc) { for (i = 0;i < PANEL_PREALLOC_COUNT;i++) { if (p_lwork[i] != NULL && lwork <= panel_max_lwork) { PANEL->WORK = p_lwork[i]; PANEL->memalloc = panel_max_lwork; p_lwork[i] = NULL; break; } } if (i == PANEL_PREALLOC_COUNT) { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Problem with preallocated panel memory"); } /* if (PANEL->WORK) { CALDGEMM_free(PANEL->WORK); fprintf(STD_OUT, "WARNING, reallocating Panel memory\n"); } if( !( PANEL->WORK = (void *) CALDGEMM_alloc( (size_t)(lwork) * sizeof( double ), 0) ) ) { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" ); } PANEL->memalloc = lwork;*/ } /* * Initialize the pointers of the panel structure - Always re-use A in * the only process column */ PANEL->L2 = PANEL->A + ( myrow == icurrow ? JB : 0 ); PANEL->ldl2 = A->ld; PANEL->L1 = (double *)HPL_PTR( PANEL->WORK, dalign ); PANEL->DPIV = PANEL->L1 + JB * JB; PANEL->DINFO = PANEL->DPIV + JB; *(PANEL->DINFO) = 0.0; PANEL->U = ( nprow > 1 ? (double*) HPL_PTR( (PANEL->DINFO + 1), dalign ) : NULL ); } else { /* space for L2, L1, DPIV */ ml2 = ( myrow == icurrow ? mp - JB : mp ); ml2 = Mmax( 0, ml2 ); PANEL->len = ml2*JB + ( itmp1 = JB*JB + JB + 1 ); #ifdef HPL_COPY_L lwork = ALGO->align + PANEL->len; #else lwork = ALGO->align + ( mycol == icurcol ? itmp1 : PANEL->len ); #endif if( nprow > 1 ) /* space for U */ { nu = ( mycol == icurcol ? nq - JB : nq ); if (nu % 8) nu += 8 - nu % 8; if (nu % 16 == 0) nu += 8; lwork += JB * Mmax( 0, nu ) + ALGO->align; } //printf("WORK2 %d of %d\n", (int) lwork, (int) panel_max_lwork); if (lwork > PANEL->memalloc) { for (i = 0;i < PANEL_PREALLOC_COUNT;i++) { if (p_lwork[i] != NULL && lwork <= panel_max_lwork) { PANEL->WORK = p_lwork[i]; PANEL->memalloc = panel_max_lwork; p_lwork[i] = NULL; break; } } if (i == PANEL_PREALLOC_COUNT) { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Problem with preallocated panel memory"); } /* if (PANEL->WORK) { CALDGEMM_free(PANEL->WORK); fprintf(STD_OUT, "WARNING, reallocating Panel memory\n"); } if( !( PANEL->WORK = (void *) CALDGEMM_alloc( (size_t)(lwork) * sizeof( double ),0 ) ) ) { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" ); } PANEL->memalloc = lwork;*/ } /* * Initialize the pointers of the panel structure - Re-use A in the cur- * rent process column when HPL_COPY_L is not defined. */ #ifdef HPL_COPY_L PANEL->L2 = (double *)HPL_PTR( PANEL->WORK, dalign ); PANEL->ldl2 = Mmax( 1, ml2 ); PANEL->L1 = PANEL->L2 + ml2 * JB; #else if( mycol == icurcol ) { PANEL->L2 = PANEL->A + ( myrow == icurrow ? JB : 0 ); PANEL->ldl2 = A->ld; PANEL->L1 = (double *)HPL_PTR( PANEL->WORK, dalign ); } else { PANEL->L2 = (double *)HPL_PTR( PANEL->WORK, dalign ); PANEL->ldl2 = Mmax( 1, ml2 ); PANEL->L1 = PANEL->L2 + ml2 * JB; } #endif PANEL->DPIV = PANEL->L1 + JB * JB; PANEL->DINFO = PANEL->DPIV + JB; *(PANEL->DINFO) = 0.0; if (nprow > 1) { PANEL->U = (double *)HPL_PTR( (PANEL->DINFO + 1), dalign ); } } /* * If nprow is 1, we just allocate an array of JB integers for the swap. * When nprow > 1, we allocate the space for the index arrays immediate- * ly. The exact size of this array depends on the swapping routine that * will be used, so we allocate the maximum: * * IWORK[0] is of size at most 1 + * IPL is of size at most 1 + * IPID is of size at most 4 * JB + * * For HPL_pdlaswp00: * lindxA is of size at most 2 * JB + * lindxAU is of size at most 2 * JB + * llen is of size at most NPROW + * llen_sv is of size at most NPROW. * * For HPL_pdlaswp01: * ipA is of size ar most 1 + * lindxA is of size at most 2 * JB + * lindxAU is of size at most 2 * JB + * iplen is of size at most NPROW + 1 + * ipmap is of size at most NPROW + * ipmapm1 is of size at most NPROW + * permU is of size at most JB + * iwork is of size at most MAX( 2*JB, NPROW+1 ). * * that is 3 + 8*JB + MAX(2*NPROW, 3*NPROW+1+JB+MAX(2*JB,NPROW+1)) * = 4 + 9*JB + 3*NPROW + MAX( 2*JB, NPROW+1 ). * * We use the fist entry of this to work array to indicate whether the * the local index arrays have already been computed, and if yes, by * which function: * IWORK[0] = -1: no index arrays have been computed so far; * IWORK[0] = 0: HPL_pdlaswp00 already computed those arrays; * IWORK[0] = 1: HPL_pdlaswp01 already computed those arrays; * This allows to save some redundant and useless computations. */ if( nprow == 1 ) { lwork = JB; } else { itmp1 = (JB << 1); lwork = nprow + 1; itmp1 = Mmax( itmp1, lwork ); lwork = 4 + (9 * JB) + (3 * nprow) + itmp1; } //printf("IWORK3 %d of %d\n", (int) lwork, (int) panel_max_ilwork); if (lwork > PANEL->memallocI) { /*if (PANEL->IWORK) { CALDGEMM_free(PANEL->IWORK); fprintf(STD_OUT, "WARNING, reallocating Panel memory\n"); } PANEL->IWORK = (int *) CALDGEMM_alloc( (size_t)(lwork) * sizeof( int ), 0 ); PANEL->memallocI = lwork;*/ for (i = 0;i < PANEL_PREALLOC_COUNT;i++) { if (p_ilwork[i] != NULL && lwork <= panel_max_ilwork) { PANEL->IWORK = (int*) p_ilwork[i]; PANEL->memallocI = panel_max_ilwork; p_ilwork[i] = NULL; break; } } if (i == PANEL_PREALLOC_COUNT) { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Problem with preallocated panel memory"); } } if( PANEL->IWORK == NULL ) { HPL_pabort( __LINE__, "HPL_pdpanel_init", "Memory allocation failed" ); } /* Initialize the first entry of the workarray */ *(PANEL->IWORK) = -1; END_TRACE /* * End of HPL_pdpanel_init */ }