Exemplo n.º 1
0
static void
genPreloadedTileMul(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    const Tile *parTile,
    const char* copy2LDSFuncName)
{
    char tmp[1024];
    KernelExtraFlags kflags = gset->kextra->flags;
    unsigned int bwidthOld;
    const char *oldNameB;
    const char *ptrName;

    getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName);
    kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);

    if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
        sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n",
            copy2LDSFuncName, gset->subdims[0].x);
    }
    else {
        sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n",
            copy2LDSFuncName, gset->subdims[0].x);
    }
    kgenAddStmt(ctx, tmp);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "lB = lBMain;\n\n");

    mulOpts->memB = CLMEM_LOCAL_MEMORY;
    oldNameB = gset->varNames.B;
    bwidthOld = (unsigned int)gset->subdims[0].bwidth;
    gset->varNames.B = "lB";
    gset->subdims[0].bwidth = (parTile->trans) ? parTile->nrRows :
                                                 parTile->nrCols;

    tileMulGen(ctx, gset, mulOpts);

    gset->varNames.B = oldNameB;
    gset->subdims[0].bwidth = bwidthOld;
    mulOpts->memB = CLMEM_GLOBAL_MEMORY;
}
Exemplo n.º 2
0
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    char tmp[1024];
    struct KgenContext *ctx;
    ssize_t ret;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    DataType dtype = kextra->dtype;
    KernelExtraFlags kflags = kextra->flags;
    CLBLASKernExtra extraNew;
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    const char *ptrName;
    UpdateResultFlags upFlags = 0;
    TilePostFetchPrivate pfPriv;
    unsigned int l1Pans;
    bool b;
    Tile parTile;
    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
    int ldsLarge, lds_diagonal;
    bool isInline;
    TileSet tileSet;
    char copy2LDSFuncName[FUNC_NAME_MAXLEN];
    TailStatus tailStatus = 0;
    FetchAddrMode addrMode = 0;
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    bool tailN = ((kflags & KEXTRA_TAILS_N) != 0);
    size_t alignK;

    if (pgran->wgDim != 1) {
        return -EINVAL;
    }

    l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);

    memset(&gset, 0, sizeof(gset));
    gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS;
    memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2);
    // there is not need in block structure along K
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    subdims = gset.subdims;

    /*
     * Since tiles are changed dynamically, e. g. in the main tilemul
     * loop they are rectangular, but at the second stage both A and B
     * tile storages are used for square tiles. One must adjust physical
     * vectorization accordindly, so as vector length might not be
     * greater than linear size of any tile
     */
    memcpy(&extraNew, kextra, sizeof(extraNew));
    extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y);
    extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y);

    gset.pgran = pgran;
    gset.kextra = &extraNew;
    initKernelVarNames(&gset.varNames);

    // multiplication options
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
    mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD :
                                                         TILEMUL_MULADD;
    mulOpts.postFetch = NULL;
    mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
    mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K;
    mulOpts.fctx = createFetchContext();
    if (mulOpts.fctx == NULL) {
        return -ENOMEM;
    }

    disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING);

    isInline = (gset.flags & BGF_EXPLICIT_INLINE);

    initTiles(&gset, &tileSet, subdims, kflags, dtype,
              PRIV_STORAGE_VARIABLE_SET);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        destroyFetchContext(mulOpts.fctx);
        return -ENOMEM;
    }

    kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n");

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    if (isComplexType(dtype)) {
        genComplexMathOperators(ctx, dtype);
    }
    if(!isInline) {
        genTileInverting(ctx, &gset, &tileSet);
    }

    if ( extraParams->ldsUse != LDS_NO_USE ) {
        SubproblemDim sdims;
        DBlockCopyFlags flags;
        unsigned int vecLen;

        if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
            sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor;
            sdims.y = gset.subdims[0].x;
        }
        else {
            sdims.x = gset.subdims[0].x;
            sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor;
        }

        vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B);
        flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0;
        copyDataBlockGen(ctx, &sdims, gset.pgran, dtype,
                         DBLOCK_GLOBAL_TO_LOCAL, flags);
        kgenAddBlankLine(ctx);
        kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx);
    }

    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false,
                      true);
    kgenBeginFuncBody(ctx);

    declareLocalVariables(ctx, &gset, &parTile, extraParams);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        kgenAddStmt(ctx, "A += offA;\n");
    }
    genTrxmBMatrShift(ctx, kflags, false);

    ptrName = dtypeUPtrField(dtype);

    sprintf(tmp, "uB.%s = B;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // external loop
    sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
    kgenBeginBranch(ctx, tmp);
    genZeroTile(ctx, &gset.tileCY);
    genSetupCoords(ctx, &gset, BLOCK_UPDATE);

    kgenAddStmt(ctx, "// Stage 1. Multiply and update with large blocks\n");

    gset.tileA = tileSet.rectA;
    gset.tileBX = tileSet.origB;

    if (!isMatrixUpper(kflags) && tailM) {
        addrMode |= FETCH_ADDR_A_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0);
    alignK = subdims[1].bwidth;
    if (ldsLarge) {
        alignK *= extraParams->unrollingFactor;
    }

    if (ldsLarge) {
        const char *oldCoordB;
        FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE;
        bool withSkew;

        withSkew = useSkewedFetchB(&gset);
        if (!withSkew) {
            bamode |= FETCH_ADDR_B_RELATIVE;
        }
        else {
            bamode |= FETCH_ADDR_B_CYCLICAL;
        }

        setFetchAddrMode(mulOpts.fctx, bamode);

        if (tailN) {
            /*
             * Conditional branch for those items which hit into
             * matrix B with their matrix coordinates
             */
            sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x);
            kgenBeginBranch(ctx, tmp);
        }

        if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
            kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName);
        }
        else {
            kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName);
        }

        if (withSkew) {
            unsigned int bwidthOld;

            oldCoordB = gset.varNames.coordB;
            gset.varNames.coordB = "skewX";
            bwidthOld = gset.subdims[0].bwidth;
            gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows :
                                                       parTile.nrCols;
            gset.subdims[0].bwidth = bwidthOld;
        }

        genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK);
        genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile, copy2LDSFuncName);
        genInternalLoopEnd(ctx);                             // loop over K

        if (withSkew) {
            gset.varNames.coordB = oldCoordB;
            setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL);
            // deliver from skew in the result before proceed to the next stage
            genTileCyclicalShift(ctx, &gset);
        }

        if (tailN) {
            kgenEndBranch(ctx, NULL);
            kgenBeginBranch(ctx, "else");
        }

        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    if (!ldsLarge || tailN) {
        genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
        if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) {
            addrMode |= FETCH_ADDR_B_CYCLICAL;
            setFetchAddrMode(mulOpts.fctx, addrMode);
        }

        if (tailN) {
            sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")");
            kgenBeginBranch(ctx, tmp);
        }

        genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK);
        tileMulGen(ctx, &gset, &mulOpts);
        genInternalLoopEnd(ctx);                             // loop over K

        if (tailN) {
            kgenEndBranch(ctx, NULL);
        }

        if (extraParams->ldsUse & LDS_USE_LARGE) {
            kgenEndBranch(ctx, NULL);
        }
    }

    sprintf(tmp, "uA.%s = A;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // processing tails along update dimension
    if (isMatrixUpper(kflags) &&
        ((kflags & KEXTRA_TAILS_K_LOWER) ||
          (ldsLarge && extraParams->unrolledTail))) {

        unsigned int tailChunks;

        tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ?
            extraParams->unrolledTail : 1;

        if (tailN) {
            char hitCond[1024];

            sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")");
            sprintf(tmp, "if ((currM + %lu < M) && %s)",
                    subdims[0].y, hitCond);
        }
        else {
            sprintf(tmp, "if (currM + %lu < M)", subdims[0].y);
        }
        kgenBeginBranch(ctx, tmp);

        if (kflags & KEXTRA_TAILS_K_LOWER) {
            setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL);
            setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        }
        if (tailChunks > 1) {
            mulOpts.flags &= ~TILEMUL_NOT_INC_K;
            sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks);
            kgenBeginBranch(ctx, tmp);
        }

		addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);
        if (tailChunks > 1) {
            kgenEndBranch(ctx, NULL);
            mulOpts.flags |= TILEMUL_NOT_INC_K;
        }

        kgenEndBranch(ctx, NULL);
    }

    gset.tileA = tileSet.squareA;

    kgenAddStmt(ctx, "\n/*\n"
                     " * Stage 2. A part of work items multiply got result on "
                     "a respective\n"
                     " * inverted diagonal block, and the remaining ones wait. "
                     "Then they perform\n"
                     " * one step of further intermediate result evaluation as "
                     "multiplying tile by tile.\n"
                     " * It continues until the whole panel of the "
                     "matrix A is processed\n"
                     " */\n");

    // one must deal further with square blocks strictly
    gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y;

    sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)", subdims[0].y / subdims[1].y);
    kgenBeginBranch(ctx, tmp);

    if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
        sprintf(tmp, "const int bid = lid %% %u;\n\n",
                l1Pans);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Update the intermediate result multiply on the inverted diagonal tile,
     * and write back
     */
    genSetupCoords(ctx, &gset, TILE_UPDATE);

    sprintfStage2Condition(tmp, &gset, 0);
    ret = kgenBeginBranch(ctx, tmp);

    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);
    upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA;
    genUpdateIntermResult(ctx, &gset, false, upFlags);

    kgenAddBlankLine(ctx);

    lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) &&
                    (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 &&
                    !(tailM || tailN) &&
                    !(upFlags & UPRES_NO_VECTORIZATION) &&
                    !isComplexType(kextra->dtype));

    /*
     * it's needed now to adjust addressing mode of A so as to don't
     * exceed the bound of A
     */
    if (tailM) {
        setFetchAddrMode(mulOpts.fctx,
                         addrMode | FETCH_ADDR_A_CYCLICAL |
                         FETCH_ADDR_K_CYCLICAL);
        extraNew.flags |= KEXTRA_TAILS_K_LOWER;
    }

    genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts);
    gset.tileBX = tileSet.bStage2;
    if (tailM) {
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
    }

    kgenAddStmt(ctx, "// Write back the given result\n");

    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);

    if (lds_diagonal) {
       sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans);
    }

    genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags,
                                 NULL, NULL, lds_diagonal ? tmp : NULL);

    kgenEndBranch(ctx, NULL);   // multiply on the inverted tile path
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    // continue the tile update
    kgenAddBlankLine(ctx);
    sprintfStage2Condition(tmp, &gset, 1);
    kgenBeginBranch(ctx, tmp);
    genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
    if (lds_diagonal) {
        // TODO: add here storing to LDS as well
    }
    else {
		addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);
    }
    kgenEndBranch(ctx, NULL);           // tile update path
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    kgenEndBranch(ctx, NULL);           // second stage loop

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
        kgenAddStmt(ctx, tmp);
    }

    kgenEndBranch(ctx, NULL);           // loop over M

    ret = kgenEndFuncBody(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyFetchContext(mulOpts.fctx);
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
Exemplo n.º 3
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
                  ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
    char tmp[2048];
    const char *typeName;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset, tgset, lset, gset1;
    CLBLASKernExtra kextraTmp;
    TileMulOpts mulOpts, tmulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    size_t vecLen = kextra->vecLen;
    const char *outTypeName;
    bool b;
    TilePostFetchPrivate pfPriv;
    struct symvPrivate priv;
    size_t wgSize;
    bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
    bool tailK = (kflags & KEXTRA_TAILS_K) != 0;
    bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
    bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A);
    bool isComplex = isComplexType(dtype);
    Tile tileb;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    bool isHoriz = subdims[1].bwidth >= subdims[1].y;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth;
    unsigned int cLocal;
    unsigned int nPlans;

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    memset(&pfPriv, 0, sizeof(pfPriv));
    memset(&priv, 0, sizeof(priv));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);

    declareSymvKernel(ctx, dtype, pgran, kflags);

    ret = kgenBeginFuncBody(ctx);
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    kgenAddStmt(ctx, "#define M actualN\n");
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.flags |= BGF_WHOLE_A | BGF_UPTRS;

    gset.kextra = kextra;
    gset.pgran = pgran;

    initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET);
    gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows);

    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    cLocal = wgSize / bStep;
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n",
                 gid, cLocal, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint n = coordA;\n");
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddStmt(ctx, "actualN += startN;\n");

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "uint k = k0;\n");

    if (tailK) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, "if (n + Ktail < N)");
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        vnames->sizeM = "N";
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "N";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (rowMaj) {
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n");
    }
    sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)",
        subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    upper ^= rowMaj;
    tra ^= rowMaj;
    if (upper ^ rowMaj && tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    gset.tileA.trans ^= !upper;
    tgset = gset;
    tmulOpts = mulOpts;

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK)
    {
            kextraTmp = *kextra;
            gset1 = gset;

            kextraTmp.vecLen = 1;
            gset1.kextra = &kextraTmp;

            gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1;

            gset1.tileBX.nrRows = 1;
            gset1.tileA.nrCols = 1;
            kextraTmp.vecLenA = 1;
    }


    if (isHoriz)
    {
        lset = gset;
        lset.subdims[0].bwidth = lset.subdims[1].bwidth =
            lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y);
        lset.tileA.nrCols = lset.tileA.nrRows =
            lset.tileBX.nrRows = lset.subdims[1].y;

        kgenAddStmt(ctx, "// the diagonal\n");
        kgenBeginBranch(ctx, "if (k <= n)");
        kgenAddStmt(ctx, "uint k1 = k;\n");

        if (subdims[1].bwidth != subdims[1].y) {
            kgenAddStmt(ctx, "// the pred diagonal\n");
            sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */
        }

        initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth,
            lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET,
            lset.tileA.trans, lset.tileA.packed);
        declareOneTileStorage(ctx, &tileb);

        genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                mulOpts.flags, kflags);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.tilea = lset.tileA;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        lset.tileA = tileb;
        mulOpts.postFetch = genPostFetchMirror;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &lset, &mulOpts);
        if (ret != 0) {
            return ret;
        }

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans = lset.tileA.trans ^= true;
        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        if (subdims[1].bwidth != subdims[1].y) {
            size_t width = umax(subdims[1].bwidth, subdims[1].y);
            kgenAddStmt(ctx, "// the post diagonal\n");
            if (tailK) {
                kgenBeginBranch(ctx, "if(k < N)");
            }
            sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)",
                    lset.subdims[1].bwidth,
                    width, width, width,
                    lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */

            if (tailK) {
                kgenEndBranch(ctx, NULL);
                kgenBeginBranch(ctx, "else");
                /* Handle tail along vector X */

                kgenAddStmt(ctx, "N += Ntail;\n");

                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
                sprintf(tmp, "for (k += %lu; k < actualN; k++)",
                    lset.subdims[1].bwidth);
                kgenBeginBranch(ctx, tmp);

                gset1.tileA.trans = gset.tileA.trans;

                genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset1, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
                kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
                mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
                genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
#endif

                mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A;
                kgenEndBranch(ctx, NULL);
            }
        }

        sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);
    }
    else
    {

        kgenAddStmt(ctx, "// the diagonal\n");
        sprintf(tmp, "if (k <= (n  + (get_local_id(0)%%%lu)*%lu))",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

        kgenBeginBranch(ctx, NULL);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &gset;
        mulOpts.postFetch = genPostFetchVertDiag;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans ^= true;
        lset = gset;

        sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, NULL);

        priv.diag = false;
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL); /* if */
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n");
    }
    sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");

        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
        sprintf(tmp, "for (; k < N; k++)");
        kgenBeginBranch(ctx, tmp);

        gset1.tileA.trans = gset.tileA.trans;

        genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset1, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
        mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
#endif

        kgenEndBranch(ctx, NULL);

        kgenBeginBranch(ctx, "else");

        sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A;
        genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        pfPriv.gset = &tgset;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &tgset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        if (isHoriz) {
            sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth);
        }
        else {
            sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n",
                subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        }
        kgenAddStmt(ctx, tmp);

        kgenEndBranch(ctx, NULL); /* k loop */

        kgenBeginBranch(ctx, "if (k < N)");
        if (isHoriz) {
            kgenAddStmt(ctx, "k = n;\n");
        }
        else {
            sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
                subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
            kgenAddStmt(ctx, tmp);
        }

        genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        if (!isHoriz) {
            if (upper ^ rowMaj && tra) {
                tmulOpts.flags &= ~TILEMUL_TRA;
            }
            else {
                tmulOpts.flags |= TILEMUL_TRA;
            }
            kgenAddStmt(ctx, "Ktail = N - n;\n");
            priv.coord = true;
        }
        else {
            priv.coord = false;
        }
        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K;


        ret = tileMulGen(ctx, &lset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        kgenEndBranch(ctx, NULL);

        kgenEndBranch(ctx, NULL);
    }


    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    sprintf(tmp,"(%s - startN)", vnames->coordA);
    setResultPos(ctx, kflags, tmp);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
Exemplo n.º 4
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    //yes, KEXTRA_TAILS_K because it is set if N % bw != 0
    bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    char tmp[4096];
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    TilePostFetchPrivate pfPriv;
    unsigned int vecLen = kextra->vecLen;
    const char *outTypeName;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    const char *typeName;
    size_t wgSize;
    //unsigned int nStep = 32;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
    unsigned int cLocal;
    bool isComplex = isComplexType(dtype);
    unsigned int nPlans;

    typeName = dtypeBuiltinType(dtype);
    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // now, generate the kernel
    declareGemvKernel(ctx, dtype, pgran, kflags);
    ret = kgenBeginFuncBody(ctx);
    kgenAddStmt(ctx, "// M always denotes length of Y "
                     "and N denotes length of X in the kernel\n");
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);
    cLocal = wgSize/bStep;

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;

    gset.pgran = pgran;
    gset.kextra = kextra;
    gset.flags = BGF_UPTRS;

    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
                 gid, bStep, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < M && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    if (tailN) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "M";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    if (staggered) {
        vnames->k = "k1";
        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered);
        kgenAddStmt(ctx, tmp);
    }

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    vnames->k = "k";
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailN) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        kgenBeginBranch(ctx, "if (k < N)");

        mulOpts.flags |= TILEMUL_SKEW_B;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL;
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = subdims[1].bwidth;
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    setResultPos(ctx, kflags, vnames->coordA);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
Exemplo n.º 5
0
void
genTest(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    bool separateFetch)
{
    char s[1024];
    Kstring kstr;
    char *tName, tVect[64], *ptrName;
    KernelVarNames *vnames = &gset->varNames;
    DataType dtype = gset->kextra->dtype;
    const SubproblemDim *subdims = gset->subdims;
    unsigned int vecLen = gset->kextra->vecLen;
    size_t m, n, k;
    unsigned int i, j;
    bool tra, trb, localA, localB, vecCoords;
    int ret;
    TileMulFlags flags = mulOpts->flags;
    FetchOpts fetchOpts;

    m = gset->subdims[1].y;
    n = gset->subdims[1].x;
    k = gset->subdims[1].bwidth;

    tra = ((flags & TILEMUL_TRA) != 0);
    trb = ((flags & TILEMUL_TRB) != 0);
    localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
    localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);

    vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0);

    tVect[0] = '\0';

    if (vecCoords && vecLen != 1) {
        sprintf(tVect, "%u", vecLen);
    }

    switch (dtype) {
    case TYPE_FLOAT:
        tName = "float";
        ptrName = "f";
        break;
    case TYPE_DOUBLE:
        tName = "double";
        ptrName = "d";
        break;
    case TYPE_COMPLEX_FLOAT:
        tName = "float2";
        ptrName = "f2v";
        break;
    case TYPE_COMPLEX_DOUBLE:
        tName = "double2";
        ptrName = "d2v";
        break;
    default:
        return;
    }

    if (vecCoords) {
        //Do not use GPtrs in fetching
        vnames->A = "A";
        vnames->B = "B";
    }
    else {
        vnames->A = localA ? "LAptr" : "((GPtr)A)";
        vnames->B = localB ? "LBptr" : "((GPtr)B)";
    }
    if (!localA) {
        vnames->lda = "lda";

    }
    if (!localB) {
        vnames->ldb = "ldb";
    }
    vnames->sizeM = "M";
    vnames->sizeN = "N";
    vnames->sizeK = "K";
    vnames->skewA = "skewA";
    vnames->skewB = "skewB";
    vnames->skewK = "skewK";
    vnames->coordA = "workItemM";
    vnames->coordB = "workItemN";
    vnames->k = "k";

    kgenAddBlankLine(ctx);
    sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n",
            ITEM_WORK_M, ITEM_WORK_N);
    kgenAddStmt(ctx, s);
    kgenAddStmt(ctx, "__kernel void\n");
    sprintf(s, "%s(\n", kernelName);
    kgenAddStmt(ctx, s);
    sprintf(s,"    %s alpha,\n", tName);
    kgenAddStmt(ctx, s);
    sprintf(s,"    __global %s%s *A,\n", tName, tVect);
    kgenAddStmt(ctx, s);
    sprintf(s,"    __global %s%s *B,\n", tName, tVect);
    kgenAddStmt(ctx, s);
    kgenAddStmt(ctx, "    uint M,\n"
                     "    uint N,\n"
                     "    uint K,\n");
    sprintf(s,
            "    __global %s *C,\n"
            "    const uint iter)\n", tName);
    kgenAddStmt(ctx, s);
    kgenBeginFuncBody(ctx);
    sprintf(s, "uint workItemM = %lu * get_global_id(0);\n"
               "uint workItemN = %lu * get_global_id(1);\n",
            m, n);
    kgenAddStmt(ctx, s);
    if ((flags & TILEMUL_SKEW_A) != 0) {
        kgenAddStmt(ctx, "uint skewA = 0u;\n");
    }
    if ((flags & TILEMUL_SKEW_B) != 0) {
        kgenAddStmt(ctx, "uint skewB = 0u;\n");
    }
    if ((flags & TILEMUL_SKEW_K) != 0) {
        kgenAddStmt(ctx, "uint skewK = 0u;\n");
    }

    if (localA) {
        sprintf(s, "__local %s LA[%lu];\n",
                tName, subdims[0].bwidth * subdims[0].y);
        kgenAddStmt(ctx, s);
    }
    else { //global A
        sprintf(s, "uint lda = %s;\n", tra ? "M" : "K");
        kgenAddStmt(ctx, s);
    }
    if (localB) {
        sprintf(s, "__local %s LB[%lu];\n",
                tName, subdims[0].bwidth * subdims[0].x);
        kgenAddStmt(ctx, s);
    }
    else { //global B
        sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N");
        kgenAddStmt(ctx, s);
    }

    initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY);
    declareTileStorages(ctx, gset);

    if (vecCoords) {
        size_t ha, hb;
        char *str;

        ha = tra ? k : m;
        hb = trb ? n : k;

        if (ha > 1) {
            str = s;
            str += sprintf(str, "uint%lu ca = {0", ha);
            for (i = 1; i < ha; i++) {
                str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen);
            }
            str += sprintf(str, "};\n");
            kgenAddStmt(ctx, s);
        }
        else {
            kgenAddStmt(ctx, "uint ca = 0;\n");
        }
        vnames->vectCoordA = "ca";

        if (hb > 1) {
            str = s;
            str += sprintf(str, "uint%lu cb = {0", hb);
            for (i = 1; i < hb; i++) {
                str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen);
            }
            str += sprintf(str, "};\n");
            kgenAddStmt(ctx, s);
        }
        else {
            kgenAddStmt(ctx, "uint cb = 0;\n");
        }
        vnames->vectCoordB = "cb";

//        uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3};
//        uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3};
    }

    kgenAddBlankLine(ctx);

    sprintf(s, "for (int it = 0; it < iter; it++)");
    kgenBeginBranch(ctx, s);

    if (!(localA && localB)) {
        kgenAddStmt(ctx, "uint k = 0;\n");
    }

    genZeroTile(ctx, &gset->tileCY);

    if (vecCoords) {
        char *coordsA[2] = {"workItemM", "k"};
        char *coordsB[2] = {"k", "workItemN"};
        sprintf(s, "A += %s * (lda / %u) + %s / %u;\n",
                coordsA[tra], vecLen, coordsA[1 - tra], vecLen);
        kgenAddStmt(ctx, s);
        sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n",
                coordsB[trb], vecLen, coordsB[1 - trb], vecLen);
        kgenAddStmt(ctx, s);
    }

    sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth);
    kgenBeginBranch(ctx, s);

    /* Copy data to local memory. We know that the size of matrix is the same
     * that the size of one block and use that.
     */
    if (localA) {
        sprintf(s,
                "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n"
                "wait_group_events(1, &evA);\n"
                "barrier(CLK_LOCAL_MEM_FENCE);\n",
                subdims[0].y * subdims[0].bwidth);
        kgenAddStmt(ctx, s);
        kgenAddStmt(ctx, "LPtr LAptr;\n");
        if (tra) {
            sprintf(s,
                    "LAptr.%s = LA + workItemM;\n", ptrName);
        }
        else {
            sprintf(s,
                    "LAptr.%s = LA + workItemM * %lu;\n",
                    ptrName, subdims[0].bwidth);
        }
        kgenAddStmt(ctx, s);
    }
    if (localB) {
        sprintf(s,
                "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n"
                "wait_group_events(1, &evB);\n"
                "barrier(CLK_LOCAL_MEM_FENCE);\n",
                subdims[0].x * subdims[0].bwidth);
        kgenAddStmt(ctx, s);
        kgenAddStmt(ctx, "LPtr LBptr;\n");
        if (trb) {
            sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n",
                    ptrName, subdims[0].bwidth);
        }
        else {
            sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName);
        }
        kgenAddStmt(ctx, s);
    }

    if (!separateFetch) {
        ret = tileMulGen(ctx, gset, mulOpts);
        checkRet(ret, "Multiplier");
    }
    else {
        Tile *tileA = &gset->tileA;
        Tile *tileB = &gset->tileBX;

        memset(&fetchOpts, 0, sizeof(fetchOpts));
        if (localA) {
            fetchOpts.memA = CLMEM_LOCAL_MEMORY;
        }
        if (localB) {
            fetchOpts.memB = CLMEM_LOCAL_MEMORY;
        }

        genFillTileWithNAN(ctx, tileA);
        genFillTileWithNAN(ctx, tileB);

        if (subdims[0].bwidth != subdims[1].bwidth) {
            sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
                    subdims[0].bwidth, k);
            kgenBeginBranch(ctx, s);
        }

#if JUST_MULTIPLICATION
        for (i = 0; i < tileA->nrRows; i++) {
            for(j = 0; j < tileA->nrCols; j++) {
                sprintfTileElement(&kstr, tileA, i, j, 1);
                sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j);
                kgenAddStmt(ctx, s);
            }
        }

        for (i = 0; i < tileB->nrRows; i++) {
            for(j = 0; j < tileB->nrCols; j++) {
                sprintfTileElement(&kstr, tileB, i, j, 1);
                sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j);
                kgenAddStmt(ctx, s);
            }
        }
#else
        fetchOpts.mrole = MATRIX_B;
        fetchOpts.lineOffset = 0;
        fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows;
        ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
        checkRet(ret, "Fetching tile b");

        fetchOpts.mrole = MATRIX_A;
        fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows;
        kgenAddBlankLine(ctx);
        fetchOpts.lineOffset = 0;
        ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
        checkRet(ret, "Fetching tile a");
#endif
        ret = genMulTiles(ctx, gset, mulOpts);
        checkRet(ret, "Multiplier");
#if ! JUST_MULTIPLICATION
        sprintf(s, "k += %lu;\n", k);
        kgenAddStmt(ctx, s);
#endif
        if (subdims[0].bwidth != subdims[1].bwidth) {
            kgenEndBranch(ctx, NULL);
        }
    }
    kgenEndBranch(ctx, NULL); // K loop
    kgenEndBranch(ctx, NULL); // iterations loop

    kgenAddBlankLine(ctx);

    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sprintfTileElement(&kstr, &gset->tileCY, i, j, 1);
                sprintf(s,
                        "((GPtr)C).%s"
                    "[(%d + workItemM) * N  + %d + workItemN] = %s;\n",
                    ptrName, i, j, kstr.buf);
                kgenAddStmt(ctx, s);
            }
                }

    kgenEndFuncBody(ctx);
}
Exemplo n.º 6
0
static int
genLoopsK(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    char *tmp)
{
    KernelExtraFlags kflags = gset->kextra->flags;
    const size_t y0 = gset->subdims[0].y;
    const size_t bwidth = gset->subdims[1].bwidth;
    int ret;
    bool isRel = false;
    const char *inTypeNameA, *inPtrNameA, *inTypeNameB, *inPtrNameB;

    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenA, &inTypeNameA, &inPtrNameA);
    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenB, &inTypeNameB, &inPtrNameB);

    sprintf(tmp, "uint k0;\n");
    kgenAddStmt(ctx, tmp);

    if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER |
                    KEXTRA_TAILS_K_LOWER))) {

        FetchAddrMode addrMode = FETCH_ADDR_A_RELATIVE | FETCH_ADDR_B_RELATIVE |
                                 FETCH_ADDR_K_RELATIVE;

        isRel = true;

        mulOpts->fctx = createFetchContext();
        if (mulOpts->fctx == NULL) {
            return -ENOMEM;
        }
        setFetchAddrMode(mulOpts->fctx, addrMode);

        gset->varNames.A = "pA";
        gset->varNames.B = "pB";
    }
    else {
        gset->flags |= BGF_UPTRS;
        kgenPrintf(ctx, "GPtr Ag, Bg;\n"
                        "\n"
                        "Ag.%s = A;\n"
                        "Bg.%s = B;\n\n",
                   inPtrNameA, inPtrNameB);
    }

    if (isMatrixUpper(kflags)) {
        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))
                   ) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            }
        }

        sprintf(tmp,
            "for (k0 = kBegin; "
                "(k0 <= (kBegin + %luu))&&(k0 < M); "
                "k0 += %lu)",
            y0,
            bwidth);
        kgenBeginBranch(ctx, tmp);

        kgenPrintf( ctx,
            "coord.z = k0;\n");

        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        //main triangle part
        sprintf(tmp,
            "for (; k0 <= max(0, (int)M - %lu); k0 += %lu)",
            y0,
            gset->subdims[1].bwidth);

        kgenBeginBranch(ctx, tmp);

        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // matrix side part
        // should be calculated by item0 of each subgroup
        sprintf(tmp, "for (; k0 < M; k0 += %lu)", bwidth);
        kgenBeginBranch(ctx, tmp);

        kgenPrintf( ctx,
            "coord.z = k0;\n");

        resetFetchNumA(mulOpts);
        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

    }
    else {
        // lower
        size_t diagBlocks; //Number of bw *y blocks that fit in y*y square

        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))
                   ) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            }
        }

        diagBlocks = divRoundUp(y0, bwidth);
        sprintf(tmp, "uint iterK = min(currM + %luu, M);\n", y0);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "iterK = (iterK + %lu) / %lu;\n", bwidth - 1, bwidth);
        kgenAddStmt(ctx, tmp);

        // main triangle part
        sprintf(tmp, "for (k0 = 0; k0 < max(0, (int)iterK - %lu); k0++)",
                diagBlocks);
        kgenBeginBranch(ctx, tmp);
        mulOpts->postFetch = NULL;
        // part without diagonal elements post fetch zeroing
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // diagonal part
        sprintf(tmp, "for (; k0 < iterK; k0++)");
        kgenBeginBranch(ctx, tmp);

        kgenPrintf( ctx,
            "coord.z = k0 * %lu;\n",
            bwidth);

        // diagonal blocks part
        mulOpts->postFetch = genTrxmPostFetchZero;
        resetFetchNumA(mulOpts);
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (isRel) {
        destroyFetchContext(mulOpts->fctx);
        mulOpts->fctx = NULL;
    }

    return 0;
}
Exemplo n.º 7
0
static int
genSubgLoopsK(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    SubgVarNames* pSubgVNames,
    size_t staggered)
{
    char tmp[1024];
    KernelExtraFlags kflags = gset->kextra->flags;
    const size_t y0 = gset->subdims[0].y;
    const size_t bw1 = gset->subdims[1].bwidth;
    const size_t bw0 = gset->subdims[0].bwidth;

    // bw, that will be used for diagonal block evaluation
    size_t diagBw1 = getVecLen( gset, CLBLAS_TRMM, MATRIX_A );

    // saving dimensions of tile A, that will be changed for
    // diagonal block
    size_t sDimA = gset->tileA.trans ?
        gset->tileA.nrRows:
        gset->tileA.nrCols;

    size_t sDimB = gset->tileBX.trans ?
        gset->tileBX.nrRows:
        gset->tileBX.nrCols;

    const CLBLASKernExtra* psKExtra = gset->kextra;
    CLBLASKernExtra diagKExtra;
    TilePostFetchPrivate postFPriv;
    int ret = 0;

    kgenPrintf( ctx, "uint k0;\n" );
    kgenPrintf( ctx, "uint kMax;\n" );

    // upper triangle case
    if (isMatrixUpper(kflags)) {

        // diagonal part ------------------------------------------------------

        // adjust tile and kextra settings for
        // processing diagonal block
        gset->subdims[1].bwidth = diagBw1;
        if ( gset->tileA.trans ) {
            gset->tileA.nrRows = diagBw1;
        }
        else {
            gset->tileA.nrCols = diagBw1;
        }
        if ( gset->tileBX.trans ) {
            gset->tileBX.nrRows = diagBw1;
        }
        else {
            gset->tileBX.nrCols = diagBw1;
        }
        memcpy( &diagKExtra,gset->kextra,sizeof(CLBLASKernExtra) );
        diagKExtra.vecLenA = diagBw1 < psKExtra->vecLenA?
            diagBw1:
            psKExtra->vecLenA;
        diagKExtra.vecLenB = diagBw1 < psKExtra->vecLenB?
            diagBw1:
            psKExtra->vecLenB;
        gset->kextra = (const CLBLASKernExtra*)&diagKExtra;

        // Process the triangle block by the 0 item
        // of each subgroup
        kgenPrintf( ctx, "// k-coordinate of the end of diagonal block\n" );
        kgenPrintf( ctx, "// calculated to be aligned to bw1\n");
        kgenPrintf( ctx,
            "kMax = kBegin + %lu + (%lu - %lu%%(kBegin+%lu));\n",
            y0,
            bw1,
            bw1,
            y0);

        sprintf( tmp, "if( %s.x == 0 )", pSubgVNames->itemId );
        kgenBeginBranch( ctx, tmp );

        sprintf( tmp,
            "for( k0=kBegin; (k0<kMax)&&(k0<M); k0+=%lu )",
            diagBw1 );
        kgenBeginBranch( ctx, tmp );

        kgenPrintf( ctx, "%s=k0;\n", gset->varNames.k );
        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen( ctx, gset, mulOpts );
        if( 0 != ret ){
            return ret;
        }

        kgenEndBranch(ctx, NULL);// for()
        kgenEndBranch(ctx, NULL);// if( itemId.x == 0 )

        // Restore tile and kextra settings to the
        // original parameters
        gset->subdims[1].bwidth = bw1;
        if ( gset->tileA.trans ) {
            gset->tileA.nrRows = sDimA;
        }
        else {
            gset->tileA.nrCols = sDimA;
        }
        if ( gset->tileBX.trans ) {
            gset->tileBX.nrRows = sDimB;
        }
        else {
            gset->tileBX.nrCols = sDimB;
        }
        gset->kextra = psKExtra;

        // rectangle part -----------------------------------------------------
        kgenAddBlankLine( ctx );
        kgenPrintf( ctx, "k0 = kMax;\n" );
        if ( kflags & KEXTRA_TAILS_K_LOWER ) {

            kgenPrintf( ctx, "uint alignedK = M-(M%%%lu);\n", bw1 );
        }
        // strided access
        sprintf(tmp,
            "for ( k0 = k0+%s.x*%lu; k0 < %s; k0 += %lu )",
            pSubgVNames->itemId,
            bw1,
            ( kflags & KEXTRA_TAILS_K_LOWER )? "alignedK" : "M",
            bw0);

        kgenBeginBranch(ctx, tmp);
        // TODO: make staggered access operational with lower-K tails
        /*kgenPrintf( ctx,
            "%s = (kBegin+%d) + ( m0*64*(gid%%2) + k0 )%%(M-(kBegin+%d));\n",
            gset->varNames.k,
            diagW,
            diagW); */
        kgenPrintf( ctx, "%s = k0;\n", gset->varNames.k );

        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // rectangle tail part ------------------------------------------------

        if ( kflags & KEXTRA_TAILS_K_LOWER ) {

            kgenAddBlankLine( ctx );
            kgenPrintf( ctx,
                "// lower K tail is handled by item 0 of each subgroup\n");

            sprintf(tmp, "if( (%s.x == 0)&&(kMax < M) )", pSubgVNames->itemId);
            kgenBeginBranch( ctx, tmp );

            kgenPrintf( ctx, "%s = alignedK;\n", gset->varNames.k );
            postFPriv.fetchNumA = 0;
            postFPriv.gset = gset;
            mulOpts->postFetch = defaultTilePostFetch;
            mulOpts->postFetchPriv = &postFPriv;

            ret = tileMulGen( ctx, gset, mulOpts );
            if ( ret != 0 ) {
                return ret;
            }
            kgenEndBranch( ctx, NULL );
        }
    }
    // lower triangle case
    else {

        // rectangle part -----------------------------------------------------

        kgenPrintf( ctx, "kMax = currM - currM%%%lu;\n", bw1 );
        // strided access, staggered access
        sprintf( tmp,
            "for( k0 = 0; k0 < kMax; k0 += %lu )",
            bw0 );
        kgenBeginBranch( ctx, tmp );

        kgenPrintf( ctx, "%s=(k0+%s.x*%d+%d*gid)%%kMax;\n",
            gset->varNames.k,
            pSubgVNames->itemId,
            bw1,
            staggered/bw1*bw1 );

        mulOpts->postFetch = NULL;
        // part without diagonal elements post fetch zeroing
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch( ctx, NULL );

        // diagonal part ------------------------------------------------------

        // adjust tile and kextra settings for
        // processing diagonal block
        gset->subdims[1].bwidth = diagBw1;
        if ( gset->tileA.trans ) {
            gset->tileA.nrRows = diagBw1;
        }
        else {
            gset->tileA.nrCols = diagBw1;
        }
        if ( gset->tileBX.trans ) {
            gset->tileBX.nrRows = diagBw1;
        }
        else {
            gset->tileBX.nrCols = diagBw1;
        }
        psKExtra = gset->kextra;
        memcpy( &diagKExtra,gset->kextra,sizeof(CLBLASKernExtra) );
        diagKExtra.vecLenA = diagBw1 < psKExtra->vecLenA?
            diagBw1:
            psKExtra->vecLenA;
        diagKExtra.vecLenB = diagBw1 < psKExtra->vecLenB?
            diagBw1:
            psKExtra->vecLenB;
        gset->kextra = (const CLBLASKernExtra*)&diagKExtra;

        // process the triangle block by the 0 item
        // of each subgroup
        sprintf( tmp, "if( %s.x == 0 )", pSubgVNames->itemId );
        kgenBeginBranch( ctx, tmp );

        sprintf( tmp,
            "for( k0 = kMax; (k0 < currM+%lu)&&(k0 < M); k0 += %lu )",
            y0,
            diagBw1 );
        kgenBeginBranch( ctx, tmp );

        kgenPrintf( ctx, "%s=k0;\n", gset->varNames.k );
        mulOpts->postFetch = genTrxmPostFetchZero;
        resetFetchNumA(mulOpts);
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch( ctx, NULL );// for()
        kgenEndBranch( ctx, NULL );// if( itemId.x == 0 )

        // Restore tile and kextra settings to the
        // original parameters
        gset->subdims[1].bwidth = bw1;
        if ( gset->tileA.trans ) {
            gset->tileA.nrRows = sDimA;
        }
        else {
            gset->tileA.nrCols = sDimA;
        }
        if ( gset->tileBX.trans ) {
            gset->tileBX.nrRows = sDimB;
        }
        else {
            gset->tileBX.nrCols = sDimB;
        }
        gset->kextra = psKExtra;

    }

    return 0;
}