void ParticleSystem::init(const MPQFile& f, const ModelParticleEmitterDef &mta, int *globals) { speed.init(mta.EmissionSpeed, f, globals); variation.init(mta.SpeedVariation, f, globals); spread.init(mta.VerticalRange, f, globals); lat.init(mta.HorizontalRange, f, globals); gravity.init(mta.Gravity, f, globals); lifespan.init(mta.Lifespan, f, globals); rate.init(mta.EmissionRate, f, globals); areal.init(mta.EmissionAreaLength, f, globals); areaw.init(mta.EmissionAreaWidth, f, globals); deacceleration.init(mta.Gravity2, f, globals); enabled.init(mta.en, f, globals); Vec3D colors2[3]; memcpy(colors2, f.getBuffer() + mta.p.colors.ofsKeys, sizeof(Vec3D) * 3); for (size_t i = 0; i<3; ++i) { float opacity = *reinterpret_cast<int16_t*>(f.getBuffer() + mta.p.opacity.ofsKeys + i * 2); colors[i] = Vec4D(colors2[i].x / 255.0f, colors2[i].y / 255.0f, colors2[i].z / 255.0f, opacity / 32767.0f); sizes[i] = (*reinterpret_cast<float*>(f.getBuffer() + mta.p.sizes.ofsKeys + i * 4))*mta.p.scales[i]; } mid = 0.5; slowdown = mta.p.slowdown; rotation = mta.p.rotation; pos = fixCoordSystem(mta.pos); _texture = model->_textures[mta.texture]; blend = mta.blend; rows = mta.rows; cols = mta.cols; type = mta.ParticleType; //order = mta.s2; order = mta.ParticleType>0 ? -1 : 0; parent = model->bones + mta.bone; switch (mta.EmitterType) { case 1: emitter = new PlaneParticleEmitter(this); break; case 2: emitter = new SphereParticleEmitter(this); break; } //transform = mta.flags & 1024; billboard = !(mta.flags & 4096); manim = mtime = 0; rem = 0; tofs = misc::frand(); // init tiles for (int i = 0; i<rows*cols; ++i) { TexCoordSet tc; initTile(tc.tc, i); tiles.push_back(tc); } }
void ParticleSystem::init(MPQFile &f, ModelParticleEmitterDef &mta, int *globals) { speed.init (mta.params[0], f, globals); variation.init(mta.params[1], f, globals); spread.init (mta.params[2], f, globals); lat.init (mta.params[3], f, globals); gravity.init (mta.params[4], f, globals); lifespan.init(mta.params[5], f, globals); rate.init (mta.params[6], f, globals); areal.init (mta.params[7], f, globals); areaw.init (mta.params[8], f, globals); grav2.init (mta.params[9], f, globals); for (size_t i=0; i<3; i++) { colors[i] = fromARGB(mta.p.colors[i]); sizes[i] = mta.p.sizes[i];// * mta.p.scales[i]; } mid = mta.p.mid; slowdown = mta.p.slowdown; rotation = mta.p.rotation; pos = fixCoordSystem(mta.pos); texture = model->textures[mta.texture]; blend = mta.blend; rows = mta.rows; cols = mta.cols; type = mta.s1; //order = mta.s2; order = mta.s1>0 ? -1 : 0; parent = model->bones + mta.bone; switch (mta.type) { case 1: emitter = new PlaneParticleEmitter(this); break; case 2: emitter = new SphereParticleEmitter(this); break; } //transform = mta.flags & 1024; billboard = !(mta.flags & 4096); manim = mtime = 0; rem = 0; tofs = frand(); // init tiles for (int i=0; i<rows*cols; i++) { TexCoordSet tc; initTile(tc.tc,i); tiles.push_back(tc); } }
ossimRefPtr<ossimImageData> ossimPointCloudImageHandler::getTile(const ossimIrect& tile_rect, ossim_uint32 resLevel) { if (!m_tile.valid()) initTile(); // Image rectangle must be set prior to calling getTile. m_tile->setImageRectangle(tile_rect); if (getTile(m_tile.get(), resLevel) == false) { if (m_tile->getDataObjectStatus() != OSSIM_NULL) m_tile->makeBlank(); } return m_tile; }
bool ossimLasReader::init() { bool result = false; if ( isOpen() ) { result = parseVarRecords(); if ( !result ) { result = initFromExternalMetadata(); // Checks for external FGDC text file. } // There is nothing we can do if parseVarRecords fails. if ( result ) { initTile(); } } return result; }
void initDefaultTiles( BlasGenSettings *gset, BlasFunctionID funcID, TileCreationFlags flags, PrivateStorageType storType) { const SubproblemDim *dim = &gset->subdims[1]; KernelExtraFlags kflags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; Tile *tile; const char *name; int level; bool packed; level = funcBlasLevel(funcID); packed = ((flags & TILE_PACKED) != 0); tile = &gset->tileA; selectTileBaseName(tile, "a"); initTile(tile, tile->baseName, (unsigned int)dim->y, (unsigned int)dim->bwidth, 1, dtype, storType, false, packed); tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_A); if (!(gset->flags & BGF_WHOLE_A)) { if (tile->trans) { tile->nrCols = 1; } else { tile->nrRows = 1; } } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_A); tile = &gset->tileBX; name = (level == 2) ? "x" : "b"; selectTileBaseName(tile, name); initTile(tile, tile->baseName, (unsigned int)dim->bwidth, (unsigned int)dim->x, 1, dtype, storType, false, packed); /* * NOTE: Tiles for the level 2 functions are forced to be transposed * in order to allow user to fetch elements belonging to different * rows which is very useful in case of unit increment between * elements because provides faster access to the global memory. */ if (level == 2) { tile->trans = true; } else { tile->trans = !isMatrixAccessColMaj(funcID, kflags, MATRIX_B); } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_B); tile = &gset->tileCY; name = (level == 2) ? "y" : "c"; selectTileBaseName(tile, name); initTile(tile, tile->baseName, (unsigned int)dim->y, (unsigned int)dim->x, 1, dtype, storType, false, packed); if (level == 2) { tile->trans = true; } else if (!(flags & TILE_C_FORCE_NOTRANS)) { tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_C); } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_C); // FIXME: remove the restriction /*if (isComplexType(tile->dtype)) { tile->vecLen = 1; }*/ }
static void initTiles( BlasGenSettings* gset, TileSet* tileSet, const struct SubproblemDim *subdims, KernelExtraFlags kflags, DataType dtype, PrivateStorageType storType) { unsigned int rowsA; unsigned int rowsB; unsigned int rowsC; unsigned int colsA; unsigned int colsB; unsigned int colsC; bool transA; bool transB; unsigned int vecLenA; unsigned int vecLenB; unsigned int vecLenC; rowsA = (unsigned int)subdims[1].y; colsA = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth); rowsB = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth); colsB = (unsigned int)szmax(subdims[1].x, subdims[1].y); rowsC = (unsigned int)subdims[1].y; colsC = (unsigned int)subdims[1].x; transA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); transB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); vecLenA = (unsigned int)((transA) ? subdims[1].y : subdims[1].bwidth); vecLenA = umin(vecLenA, MAX_TILE_VECLEN); vecLenB = (unsigned int)((transB) ? subdims[1].x : subdims[1].bwidth); vecLenB = umin(vecLenB, MAX_TILE_VECLEN); vecLenC = (transB) ? vecLenB : vecLenA; initTile(&tileSet->rectA, "a", (unsigned int)subdims[1].y, (unsigned int)subdims[1].bwidth, vecLenA, dtype, storType, transA, false); initTile(&tileSet->squareA, "a", (unsigned int)subdims[1].y, (unsigned int)subdims[1].y, vecLenA, dtype, storType, transA, false); initTile(&tileSet->origB, "b", (unsigned int)subdims[1].bwidth, (unsigned int)subdims[1].x, vecLenB, dtype, storType, !transB, false); initTile(&tileSet->bStage2, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].x, vecLenB, dtype, storType, !transB, false); initTile(&tileSet->bAsSqA, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].y, vecLenB, dtype, storType, transA, false); initTile(&tileSet->bAsC, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].x, vecLenB, dtype, storType, gset->tileCY.trans, false); initTile(&gset->tileA, "a", rowsA, colsA, vecLenA, dtype, storType, transA, false); initTile(&gset->tileBX, "b", rowsB, colsB, vecLenB, dtype, storType, !transB, false); initTile(&gset->tileCY, "c", rowsC, colsC, vecLenC, dtype, storType, !transB, false); tileSet->A = gset->tileA; tileSet->B = gset->tileBX; }
// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ ((kflags & KEXTRA_COLUMN_MAJOR) != 0); char tmp[2048]; const char *typeName; DataType dtype = kextra->dtype; BlasGenSettings gset, tgset, lset, gset1; CLBLASKernExtra kextraTmp; TileMulOpts mulOpts, tmulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; size_t vecLen = kextra->vecLen; const char *outTypeName; bool b; TilePostFetchPrivate pfPriv; struct symvPrivate priv; size_t wgSize; bool tailM = (kflags & KEXTRA_TAILS_M) != 0; bool tailK = (kflags & KEXTRA_TAILS_K) != 0; bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0; bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A); bool isComplex = isComplexType(dtype); Tile tileb; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; bool isHoriz = subdims[1].bwidth >= subdims[1].y; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; unsigned int cLocal; unsigned int nPlans; wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&pfPriv, 0, sizeof(pfPriv)); memset(&priv, 0, sizeof(priv)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); declareSymvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); /* 1D work space. Matrix is divided among wi, each calculates it's own * part of vector y */ kgenAddStmt(ctx, "#define M actualN\n"); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.flags |= BGF_WHOLE_A | BGF_UPTRS; gset.kextra = kextra; gset.pgran = pgran; initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET); gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); cLocal = wgSize / bStep; nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n", gid, cLocal, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint n = coordA;\n"); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "actualN += startN;\n"); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "uint k = k0;\n"); if (tailK) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, "if (n + Ktail < N)"); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { vnames->sizeM = "N"; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "N"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (rowMaj) { mulOpts.flags |= TILEMUL_BW_STRIDE; } if (upper) { kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n"); } else { kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n"); } sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)", subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); upper ^= rowMaj; tra ^= rowMaj; if (upper ^ rowMaj && tra) { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= !upper; tgset = gset; tmulOpts = mulOpts; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kextraTmp = *kextra; gset1 = gset; kextraTmp.vecLen = 1; gset1.kextra = &kextraTmp; gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1; gset1.tileBX.nrRows = 1; gset1.tileA.nrCols = 1; kextraTmp.vecLenA = 1; } if (isHoriz) { lset = gset; lset.subdims[0].bwidth = lset.subdims[1].bwidth = lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y); lset.tileA.nrCols = lset.tileA.nrRows = lset.tileBX.nrRows = lset.subdims[1].y; kgenAddStmt(ctx, "// the diagonal\n"); kgenBeginBranch(ctx, "if (k <= n)"); kgenAddStmt(ctx, "uint k1 = k;\n"); if (subdims[1].bwidth != subdims[1].y) { kgenAddStmt(ctx, "// the pred diagonal\n"); sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ } initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET, lset.tileA.trans, lset.tileA.packed); declareOneTileStorage(ctx, &tileb); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.tilea = lset.tileA; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; lset.tileA = tileb; mulOpts.postFetch = genPostFetchMirror; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans = lset.tileA.trans ^= true; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; if (subdims[1].bwidth != subdims[1].y) { size_t width = umax(subdims[1].bwidth, subdims[1].y); kgenAddStmt(ctx, "// the post diagonal\n"); if (tailK) { kgenBeginBranch(ctx, "if(k < N)"); } sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)", lset.subdims[1].bwidth, width, width, width, lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (k += %lu; k < actualN; k++)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A; kgenEndBranch(ctx, NULL); } } sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } else { kgenAddStmt(ctx, "// the diagonal\n"); sprintf(tmp, "if (k <= (n + (get_local_id(0)%%%lu)*%lu))", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); kgenBeginBranch(ctx, NULL); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &gset; mulOpts.postFetch = genPostFetchVertDiag; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= true; lset = gset; sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, NULL); priv.diag = false; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* if */ } if (upper) { kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n"); } else { kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n"); } sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (; k < N; k++)"); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A; genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; pfPriv.gset = &tgset; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &tgset, &tmulOpts); if (ret != 0) { return ret; } if (isHoriz) { sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth); } else { sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n", subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* k loop */ kgenBeginBranch(ctx, "if (k < N)"); if (isHoriz) { kgenAddStmt(ctx, "k = n;\n"); } else { sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; if (!isHoriz) { if (upper ^ rowMaj && tra) { tmulOpts.flags &= ~TILEMUL_TRA; } else { tmulOpts.flags |= TILEMUL_TRA; } kgenAddStmt(ctx, "Ktail = N - n;\n"); priv.coord = true; } else { priv.coord = false; } tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K; ret = tileMulGen(ctx, &lset, &tmulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1); /* write back the results */ /* y := alpha*A*x + beta*y */ sprintf(tmp,"(%s - startN)", vnames->coordA); setResultPos(ctx, kflags, tmp); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }