示例#1
0
void ParticleSystem::init(const MPQFile& f, const ModelParticleEmitterDef &mta, int *globals)
{
	speed.init(mta.EmissionSpeed, f, globals);
	variation.init(mta.SpeedVariation, f, globals);
	spread.init(mta.VerticalRange, f, globals);
	lat.init(mta.HorizontalRange, f, globals);
	gravity.init(mta.Gravity, f, globals);
	lifespan.init(mta.Lifespan, f, globals);
	rate.init(mta.EmissionRate, f, globals);
	areal.init(mta.EmissionAreaLength, f, globals);
	areaw.init(mta.EmissionAreaWidth, f, globals);
	deacceleration.init(mta.Gravity2, f, globals);
	enabled.init(mta.en, f, globals);

	Vec3D colors2[3];
	memcpy(colors2, f.getBuffer() + mta.p.colors.ofsKeys, sizeof(Vec3D) * 3);
	for (size_t i = 0; i<3; ++i) {
		float opacity = *reinterpret_cast<int16_t*>(f.getBuffer() + mta.p.opacity.ofsKeys + i * 2);
		colors[i] = Vec4D(colors2[i].x / 255.0f, colors2[i].y / 255.0f, colors2[i].z / 255.0f, opacity / 32767.0f);
		sizes[i] = (*reinterpret_cast<float*>(f.getBuffer() + mta.p.sizes.ofsKeys + i * 4))*mta.p.scales[i];
	}
	mid = 0.5;
	slowdown = mta.p.slowdown;
	rotation = mta.p.rotation;
	pos = fixCoordSystem(mta.pos);
	_texture = model->_textures[mta.texture];
	blend = mta.blend;
	rows = mta.rows;
	cols = mta.cols;
	type = mta.ParticleType;
	//order = mta.s2;
	order = mta.ParticleType>0 ? -1 : 0;
	parent = model->bones + mta.bone;

	switch (mta.EmitterType) {
	case 1:
		emitter = new PlaneParticleEmitter(this);
		break;
	case 2:
		emitter = new SphereParticleEmitter(this);
		break;
	}

	//transform = mta.flags & 1024;

	billboard = !(mta.flags & 4096);

	manim = mtime = 0;
	rem = 0;

	tofs = misc::frand();

	// init tiles
	for (int i = 0; i<rows*cols; ++i) {
		TexCoordSet tc;
		initTile(tc.tc, i);
		tiles.push_back(tc);
	}
}
示例#2
0
void ParticleSystem::init(MPQFile &f, ModelParticleEmitterDef &mta, int *globals)
{
	speed.init	 (mta.params[0], f, globals);
	variation.init(mta.params[1], f, globals);
	spread.init	 (mta.params[2], f, globals);
	lat.init	 (mta.params[3], f, globals);
	gravity.init (mta.params[4], f, globals);
	lifespan.init(mta.params[5], f, globals);
	rate.init	 (mta.params[6], f, globals);
	areal.init	 (mta.params[7], f, globals);
	areaw.init	 (mta.params[8], f, globals);
	grav2.init	 (mta.params[9], f, globals);

	for (size_t i=0; i<3; i++) {
		colors[i] = fromARGB(mta.p.colors[i]);
		sizes[i] = mta.p.sizes[i];// * mta.p.scales[i];
	}
	mid = mta.p.mid;
	slowdown = mta.p.slowdown;
	rotation = mta.p.rotation;
	pos = fixCoordSystem(mta.pos);
	texture = model->textures[mta.texture];
	blend = mta.blend;
	rows = mta.rows;
	cols = mta.cols;
	type = mta.s1;
	//order = mta.s2;
	order = mta.s1>0 ? -1 : 0;
	parent = model->bones + mta.bone;

	switch (mta.type) {
	case 1:
		emitter = new PlaneParticleEmitter(this);
		break;
	case 2:
		emitter = new SphereParticleEmitter(this);
		break;
	}

	//transform = mta.flags & 1024;

	billboard = !(mta.flags & 4096);

	manim = mtime = 0;
	rem = 0;

	tofs = frand();

	// init tiles
	for (int i=0; i<rows*cols; i++) {
		TexCoordSet tc;
		initTile(tc.tc,i);
		tiles.push_back(tc);
	}
}
ossimRefPtr<ossimImageData> ossimPointCloudImageHandler::getTile(const ossimIrect& tile_rect,
                                                                 ossim_uint32 resLevel)
{
   if (!m_tile.valid())
      initTile();

   // Image rectangle must be set prior to calling getTile.
   m_tile->setImageRectangle(tile_rect);
   if (getTile(m_tile.get(), resLevel) == false)
   {
      if (m_tile->getDataObjectStatus() != OSSIM_NULL)
         m_tile->makeBlank();
   }

   return m_tile;
}
示例#4
0
bool ossimLasReader::init()
{
   bool result = false;

   if ( isOpen() )
   {
      result = parseVarRecords();

      if ( !result )
      {
         result = initFromExternalMetadata(); // Checks for external FGDC text file.
      }

      // There is nothing we can do if parseVarRecords fails.
      if ( result )
      {
         initTile();
      }
   }
   
   return result;
}
示例#5
0
void
initDefaultTiles(
    BlasGenSettings *gset,
    BlasFunctionID funcID,
    TileCreationFlags flags,
    PrivateStorageType storType)
{
    const SubproblemDim *dim = &gset->subdims[1];
    KernelExtraFlags kflags = gset->kextra->flags;
    DataType dtype = gset->kextra->dtype;
    Tile *tile;
    const char *name;
    int level;
    bool packed;

    level = funcBlasLevel(funcID);
    packed = ((flags & TILE_PACKED) != 0);

    tile = &gset->tileA;
    selectTileBaseName(tile, "a");
    initTile(tile, tile->baseName, (unsigned int)dim->y,
             (unsigned int)dim->bwidth, 1, dtype, storType, false, packed);

    tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_A);
    if (!(gset->flags & BGF_WHOLE_A)) {
        if (tile->trans) {
            tile->nrCols = 1;
        }
        else {
            tile->nrRows = 1;
        }
    }
    selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_A);

    tile = &gset->tileBX;
    name = (level == 2) ? "x" : "b";
    selectTileBaseName(tile, name);
    initTile(tile, tile->baseName, (unsigned int)dim->bwidth,
             (unsigned int)dim->x, 1, dtype, storType, false, packed);

    /*
     * NOTE: Tiles for the level 2 functions are forced to be transposed
     *       in order to allow user to fetch elements belonging to different
     *       rows which is very useful in case of unit increment between
     *       elements because provides faster access to the global memory.
     */
    if (level == 2) {
        tile->trans = true;
    }
    else {
        tile->trans = !isMatrixAccessColMaj(funcID, kflags, MATRIX_B);
    }
    selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_B);

    tile = &gset->tileCY;
    name = (level == 2) ? "y" : "c";
    selectTileBaseName(tile, name);

    initTile(tile, tile->baseName, (unsigned int)dim->y,
             (unsigned int)dim->x, 1, dtype, storType, false,
             packed);

    if (level == 2) {
        tile->trans = true;
    }
    else if (!(flags & TILE_C_FORCE_NOTRANS)) {
        tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_C);
    }
    selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_C);

    // FIXME: remove the restriction
    /*if (isComplexType(tile->dtype)) {
        tile->vecLen = 1;
    }*/
}
示例#6
0
static void
initTiles(
    BlasGenSettings* gset,
    TileSet* tileSet,
    const struct SubproblemDim *subdims,
    KernelExtraFlags kflags,
    DataType dtype,
    PrivateStorageType storType)
{
    unsigned int rowsA;
    unsigned int rowsB;
    unsigned int rowsC;
    unsigned int colsA;
    unsigned int colsB;
    unsigned int colsC;
    bool transA;
    bool transB;
    unsigned int vecLenA;
    unsigned int vecLenB;
    unsigned int vecLenC;

    rowsA = (unsigned int)subdims[1].y;
    colsA = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth);

    rowsB = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth);
    colsB = (unsigned int)szmax(subdims[1].x, subdims[1].y);

    rowsC = (unsigned int)subdims[1].y;
    colsC = (unsigned int)subdims[1].x;

    transA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A);
    transB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B);

    vecLenA = (unsigned int)((transA) ? subdims[1].y : subdims[1].bwidth);
    vecLenA = umin(vecLenA, MAX_TILE_VECLEN);
    vecLenB = (unsigned int)((transB) ? subdims[1].x : subdims[1].bwidth);
    vecLenB = umin(vecLenB, MAX_TILE_VECLEN);
    vecLenC = (transB) ? vecLenB : vecLenA;

    initTile(&tileSet->rectA, "a", (unsigned int)subdims[1].y,
             (unsigned int)subdims[1].bwidth, vecLenA, dtype,
             storType, transA, false);

    initTile(&tileSet->squareA, "a", (unsigned int)subdims[1].y,
             (unsigned int)subdims[1].y, vecLenA, dtype, storType,
             transA, false);

    initTile(&tileSet->origB, "b", (unsigned int)subdims[1].bwidth,
             (unsigned int)subdims[1].x, vecLenB, dtype, storType,
             !transB, false);

    initTile(&tileSet->bStage2, "b", (unsigned int)subdims[1].y,
             (unsigned int)subdims[1].x, vecLenB, dtype, storType,
             !transB, false);

    initTile(&tileSet->bAsSqA, "b", (unsigned int)subdims[1].y,
             (unsigned int)subdims[1].y, vecLenB, dtype, storType,
             transA, false);

    initTile(&tileSet->bAsC, "b", (unsigned int)subdims[1].y,
             (unsigned int)subdims[1].x, vecLenB, dtype, storType,
             gset->tileCY.trans, false);

    initTile(&gset->tileA, "a", rowsA, colsA,
             vecLenA, dtype, storType, transA, false);

    initTile(&gset->tileBX, "b", rowsB, colsB,
             vecLenB, dtype, storType, !transB, false);

    initTile(&gset->tileCY, "c", rowsC, colsC,
             vecLenC, dtype, storType, !transB, false);

    tileSet->A = gset->tileA;
    tileSet->B = gset->tileBX;
}
示例#7
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
                  ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
    char tmp[2048];
    const char *typeName;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset, tgset, lset, gset1;
    CLBLASKernExtra kextraTmp;
    TileMulOpts mulOpts, tmulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    size_t vecLen = kextra->vecLen;
    const char *outTypeName;
    bool b;
    TilePostFetchPrivate pfPriv;
    struct symvPrivate priv;
    size_t wgSize;
    bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
    bool tailK = (kflags & KEXTRA_TAILS_K) != 0;
    bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
    bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A);
    bool isComplex = isComplexType(dtype);
    Tile tileb;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    bool isHoriz = subdims[1].bwidth >= subdims[1].y;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth;
    unsigned int cLocal;
    unsigned int nPlans;

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    memset(&pfPriv, 0, sizeof(pfPriv));
    memset(&priv, 0, sizeof(priv));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);

    declareSymvKernel(ctx, dtype, pgran, kflags);

    ret = kgenBeginFuncBody(ctx);
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    kgenAddStmt(ctx, "#define M actualN\n");
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.flags |= BGF_WHOLE_A | BGF_UPTRS;

    gset.kextra = kextra;
    gset.pgran = pgran;

    initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET);
    gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows);

    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    cLocal = wgSize / bStep;
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n",
                 gid, cLocal, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint n = coordA;\n");
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddStmt(ctx, "actualN += startN;\n");

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "uint k = k0;\n");

    if (tailK) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, "if (n + Ktail < N)");
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        vnames->sizeM = "N";
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "N";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (rowMaj) {
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n");
    }
    sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)",
        subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    upper ^= rowMaj;
    tra ^= rowMaj;
    if (upper ^ rowMaj && tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    gset.tileA.trans ^= !upper;
    tgset = gset;
    tmulOpts = mulOpts;

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK)
    {
            kextraTmp = *kextra;
            gset1 = gset;

            kextraTmp.vecLen = 1;
            gset1.kextra = &kextraTmp;

            gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1;

            gset1.tileBX.nrRows = 1;
            gset1.tileA.nrCols = 1;
            kextraTmp.vecLenA = 1;
    }


    if (isHoriz)
    {
        lset = gset;
        lset.subdims[0].bwidth = lset.subdims[1].bwidth =
            lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y);
        lset.tileA.nrCols = lset.tileA.nrRows =
            lset.tileBX.nrRows = lset.subdims[1].y;

        kgenAddStmt(ctx, "// the diagonal\n");
        kgenBeginBranch(ctx, "if (k <= n)");
        kgenAddStmt(ctx, "uint k1 = k;\n");

        if (subdims[1].bwidth != subdims[1].y) {
            kgenAddStmt(ctx, "// the pred diagonal\n");
            sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */
        }

        initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth,
            lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET,
            lset.tileA.trans, lset.tileA.packed);
        declareOneTileStorage(ctx, &tileb);

        genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                mulOpts.flags, kflags);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.tilea = lset.tileA;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        lset.tileA = tileb;
        mulOpts.postFetch = genPostFetchMirror;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &lset, &mulOpts);
        if (ret != 0) {
            return ret;
        }

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans = lset.tileA.trans ^= true;
        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        if (subdims[1].bwidth != subdims[1].y) {
            size_t width = umax(subdims[1].bwidth, subdims[1].y);
            kgenAddStmt(ctx, "// the post diagonal\n");
            if (tailK) {
                kgenBeginBranch(ctx, "if(k < N)");
            }
            sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)",
                    lset.subdims[1].bwidth,
                    width, width, width,
                    lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */

            if (tailK) {
                kgenEndBranch(ctx, NULL);
                kgenBeginBranch(ctx, "else");
                /* Handle tail along vector X */

                kgenAddStmt(ctx, "N += Ntail;\n");

                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
                sprintf(tmp, "for (k += %lu; k < actualN; k++)",
                    lset.subdims[1].bwidth);
                kgenBeginBranch(ctx, tmp);

                gset1.tileA.trans = gset.tileA.trans;

                genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset1, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
                kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
                mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
                genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
#endif

                mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A;
                kgenEndBranch(ctx, NULL);
            }
        }

        sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);
    }
    else
    {

        kgenAddStmt(ctx, "// the diagonal\n");
        sprintf(tmp, "if (k <= (n  + (get_local_id(0)%%%lu)*%lu))",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

        kgenBeginBranch(ctx, NULL);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &gset;
        mulOpts.postFetch = genPostFetchVertDiag;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans ^= true;
        lset = gset;

        sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, NULL);

        priv.diag = false;
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL); /* if */
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n");
    }
    sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");

        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
        sprintf(tmp, "for (; k < N; k++)");
        kgenBeginBranch(ctx, tmp);

        gset1.tileA.trans = gset.tileA.trans;

        genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset1, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
        mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
#endif

        kgenEndBranch(ctx, NULL);

        kgenBeginBranch(ctx, "else");

        sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A;
        genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        pfPriv.gset = &tgset;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &tgset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        if (isHoriz) {
            sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth);
        }
        else {
            sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n",
                subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        }
        kgenAddStmt(ctx, tmp);

        kgenEndBranch(ctx, NULL); /* k loop */

        kgenBeginBranch(ctx, "if (k < N)");
        if (isHoriz) {
            kgenAddStmt(ctx, "k = n;\n");
        }
        else {
            sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
                subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
            kgenAddStmt(ctx, tmp);
        }

        genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        if (!isHoriz) {
            if (upper ^ rowMaj && tra) {
                tmulOpts.flags &= ~TILEMUL_TRA;
            }
            else {
                tmulOpts.flags |= TILEMUL_TRA;
            }
            kgenAddStmt(ctx, "Ktail = N - n;\n");
            priv.coord = true;
        }
        else {
            priv.coord = false;
        }
        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K;


        ret = tileMulGen(ctx, &lset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        kgenEndBranch(ctx, NULL);

        kgenEndBranch(ctx, NULL);
    }


    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    sprintf(tmp,"(%s - startN)", vnames->coordA);
    setResultPos(ctx, kflags, tmp);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}