示例#1
0
// Generate complete vector-vector product
static void
genVecMul(
    struct KgenContext *ctx,
    unsigned int m,
    unsigned int n,
    const Tile *a,
    const Tile *b,
    const Tile *c,
    bool conjA,
    bool conjB,
    TileMulCore core,
    bool wholeA)
{
    unsigned int k;
    char tmp[MAX_LENGTH];
    Kstring elA, elB, elC;
    unsigned int vlen = 0;
    bool isComplex;
    bool isDouble;

    isDouble = isDoubleBasedType(c->dtype);
    isComplex = isComplexType(c->dtype);
    if ((core == TILEMUL_DOT) && !isComplex) {
        vlen = commonTileSegmentLen(a, b);
    }
    else {
        vlen = 1;
    }

    sprintfTileElement(&elC, c, m, n, 1);
    if (!wholeA) {
        m = 0;
        }

    for (k = 0; k < a->nrCols; k += vlen) {
        sprintfTileElement(&elA, a, m, k, vlen);
        sprintfTileElement(&elB, b, k, n, vlen);

        /*
         * Using 'dot' is not valid for complex, and replaced with '*' operator
         * for unvectorized real data
         */
        if ((core == TILEMUL_DOT) && (vlen > 1)) {
            sprintf(tmp, "%s += dot(%s, %s);\n",
                    elC.buf, elA.buf, elB.buf);
        }
        else if (isComplex) {
            Kstring expr;

            sprintfComplexMulUpdate(&expr, &elC, &elA, &elB, &elC, isDouble,
                                    conjA, conjB, core);
            kgenAddStmt(ctx, expr.buf);
        }
        else {
            genRealMulUpdate(ctx, &elA, &elB, &elC, c->trans, core);
        }
    }
}
示例#2
0
static int trmmSubgGetDefaultDecomp( PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs)
{
    int itemsPerSubg = 4;
    int subgA = 8;
    int subgB = 2;

    int bw1 = 8;
    int x1 = 4;
    int y1 = 4;
    CLBlasKargs *kargs;

    DUMMY_ARG_USAGE(subdimsNum);

    if ( NULL == pArgs ) {
        return -EINVAL;
    }

    kargs = (CLBlasKargs *)pArgs;

    if( isComplexType(kargs->dtype) ){
        bw1 /= 2;
    }
    if( isDoubleBasedType(kargs->dtype) ){
        bw1 /= 2;
    }

    subdims[1].bwidth = bw1;
    subdims[1].x = subdims[1].itemX = x1;
    subdims[1].y = subdims[1].itemY = y1;

    subdims[0].bwidth = bw1 * itemsPerSubg;
    subdims[0].itemX = x1 * subgB;
    subdims[0].x = x1*subgB;

    subdims[0].itemY = y1*subgA;
    subdims[0].y = y1*subgA;

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    return 0;
}
示例#3
0
/*
 * Generate one stage of vector-vector product. Iterating over M and N having
 * fixed coordinate over K.
 */
static void
genStagedVecMul(
    struct KgenContext *ctx,
    unsigned int lineA,
    unsigned int k,
    const Tile *a,
    const Tile *b,
    const Tile *c,
    bool conjA,
    bool conjB,
    TileMulCore core,
    bool wholeA)
{
    Kstring elA, elB, elC;
    unsigned int stepM, endM, stepN, vlenC;
    unsigned int i, j;
    unsigned int m, ma, ka;
    bool isDouble;
    bool isComplex;

    if (a->trans) {
        m = 0;
        endM = a->nrRows;
    }
    else {
        m = lineA;
        endM = m + 1;
    }

    isDouble = isDoubleBasedType(c->dtype);
    isComplex = isComplexType(c->dtype);

    if (( (c->trans == a->trans) || (c->trans == b->trans) ) &&
        !isComplex) {

        if (c->trans) {
            stepM = vlenC = commonTileSegmentLen(a, c);
            stepN = 1;
    }
    else {
            stepM = 1;
            stepN = vlenC = commonTileSegmentLen(b, c);
    }
    }
    else {
        stepM = stepN = 1;
        vlenC = 1;
    }

    ka = selectColA(a, k, wholeA);

    for (i = m; i < endM; i += stepM) {
        ma = selectRowA(a, i, wholeA);
        sprintfTileElement(&elA, a, ma, ka, stepM);

        for (j = 0; j < b->nrCols; j += stepN) {
            sprintfTileElement(&elB, b, k, j, stepN);
            sprintfTileElement(&elC, c, i, j, vlenC);

            if (isComplex) {
                Kstring expr;

                sprintfComplexMulUpdate(&expr, &elC, &elA, &elB, &elC,
                                        isDouble, conjA, conjB, core);
                kgenAddStmt(ctx, expr.buf);
            }
            else {
                genRealMulUpdate(ctx, &elA, &elB, &elC, c->trans, core);
            }
        }
    }
}
示例#4
0
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    char tmp[1024];
    struct KgenContext *ctx;
    ssize_t ret;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    DataType dtype = kextra->dtype;
    KernelExtraFlags kflags = kextra->flags;
    CLBLASKernExtra extraNew;
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    const char *ptrName;
    UpdateResultFlags upFlags = 0;
    TilePostFetchPrivate pfPriv;
    unsigned int l1Pans;
    bool b;
    Tile parTile;
    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
    int ldsLarge, lds_diagonal;
    bool isInline;
    TileSet tileSet;
    char copy2LDSFuncName[FUNC_NAME_MAXLEN];
    TailStatus tailStatus = 0;
    FetchAddrMode addrMode = 0;
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    bool tailN = ((kflags & KEXTRA_TAILS_N) != 0);
    size_t alignK;

    if (pgran->wgDim != 1) {
        return -EINVAL;
    }

    l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);

    memset(&gset, 0, sizeof(gset));
    gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS;
    memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2);
    // there is not need in block structure along K
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    subdims = gset.subdims;

    /*
     * Since tiles are changed dynamically, e. g. in the main tilemul
     * loop they are rectangular, but at the second stage both A and B
     * tile storages are used for square tiles. One must adjust physical
     * vectorization accordindly, so as vector length might not be
     * greater than linear size of any tile
     */
    memcpy(&extraNew, kextra, sizeof(extraNew));
    extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y);
    extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y);

    gset.pgran = pgran;
    gset.kextra = &extraNew;
    initKernelVarNames(&gset.varNames);

    // multiplication options
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
    mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD :
                                                         TILEMUL_MULADD;
    mulOpts.postFetch = NULL;
    mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
    mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K;
    mulOpts.fctx = createFetchContext();
    if (mulOpts.fctx == NULL) {
        return -ENOMEM;
    }

    disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING);

    isInline = (gset.flags & BGF_EXPLICIT_INLINE);

    initTiles(&gset, &tileSet, subdims, kflags, dtype,
              PRIV_STORAGE_VARIABLE_SET);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        destroyFetchContext(mulOpts.fctx);
        return -ENOMEM;
    }

    kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n");

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    if (isComplexType(dtype)) {
        genComplexMathOperators(ctx, dtype);
    }
    if(!isInline) {
        genTileInverting(ctx, &gset, &tileSet);
    }

    if ( extraParams->ldsUse != LDS_NO_USE ) {
        SubproblemDim sdims;
        DBlockCopyFlags flags;
        unsigned int vecLen;

        if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
            sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor;
            sdims.y = gset.subdims[0].x;
        }
        else {
            sdims.x = gset.subdims[0].x;
            sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor;
        }

        vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B);
        flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0;
        copyDataBlockGen(ctx, &sdims, gset.pgran, dtype,
                         DBLOCK_GLOBAL_TO_LOCAL, flags);
        kgenAddBlankLine(ctx);
        kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx);
    }

    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false,
                      true);
    kgenBeginFuncBody(ctx);

    declareLocalVariables(ctx, &gset, &parTile, extraParams);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        kgenAddStmt(ctx, "A += offA;\n");
    }
    genTrxmBMatrShift(ctx, kflags, false);

    ptrName = dtypeUPtrField(dtype);

    sprintf(tmp, "uB.%s = B;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // external loop
    sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
    kgenBeginBranch(ctx, tmp);
    genZeroTile(ctx, &gset.tileCY);
    genSetupCoords(ctx, &gset, BLOCK_UPDATE);

    kgenAddStmt(ctx, "// Stage 1. Multiply and update with large blocks\n");

    gset.tileA = tileSet.rectA;
    gset.tileBX = tileSet.origB;

    if (!isMatrixUpper(kflags) && tailM) {
        addrMode |= FETCH_ADDR_A_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0);
    alignK = subdims[1].bwidth;
    if (ldsLarge) {
        alignK *= extraParams->unrollingFactor;
    }

    if (ldsLarge) {
        const char *oldCoordB;
        FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE;
        bool withSkew;

        withSkew = useSkewedFetchB(&gset);
        if (!withSkew) {
            bamode |= FETCH_ADDR_B_RELATIVE;
        }
        else {
            bamode |= FETCH_ADDR_B_CYCLICAL;
        }

        setFetchAddrMode(mulOpts.fctx, bamode);

        if (tailN) {
            /*
             * Conditional branch for those items which hit into
             * matrix B with their matrix coordinates
             */
            sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x);
            kgenBeginBranch(ctx, tmp);
        }

        if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
            kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName);
        }
        else {
            kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName);
        }

        if (withSkew) {
            unsigned int bwidthOld;

            oldCoordB = gset.varNames.coordB;
            gset.varNames.coordB = "skewX";
            bwidthOld = gset.subdims[0].bwidth;
            gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows :
                                                       parTile.nrCols;
            gset.subdims[0].bwidth = bwidthOld;
        }

        genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK);
        genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile, copy2LDSFuncName);
        genInternalLoopEnd(ctx);                             // loop over K

        if (withSkew) {
            gset.varNames.coordB = oldCoordB;
            setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL);
            // deliver from skew in the result before proceed to the next stage
            genTileCyclicalShift(ctx, &gset);
        }

        if (tailN) {
            kgenEndBranch(ctx, NULL);
            kgenBeginBranch(ctx, "else");
        }

        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    if (!ldsLarge || tailN) {
        genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
        if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) {
            addrMode |= FETCH_ADDR_B_CYCLICAL;
            setFetchAddrMode(mulOpts.fctx, addrMode);
        }

        if (tailN) {
            sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")");
            kgenBeginBranch(ctx, tmp);
        }

        genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK);
        tileMulGen(ctx, &gset, &mulOpts);
        genInternalLoopEnd(ctx);                             // loop over K

        if (tailN) {
            kgenEndBranch(ctx, NULL);
        }

        if (extraParams->ldsUse & LDS_USE_LARGE) {
            kgenEndBranch(ctx, NULL);
        }
    }

    sprintf(tmp, "uA.%s = A;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // processing tails along update dimension
    if (isMatrixUpper(kflags) &&
        ((kflags & KEXTRA_TAILS_K_LOWER) ||
          (ldsLarge && extraParams->unrolledTail))) {

        unsigned int tailChunks;

        tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ?
            extraParams->unrolledTail : 1;

        if (tailN) {
            char hitCond[1024];

            sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")");
            sprintf(tmp, "if ((currM + %lu < M) && %s)",
                    subdims[0].y, hitCond);
        }
        else {
            sprintf(tmp, "if (currM + %lu < M)", subdims[0].y);
        }
        kgenBeginBranch(ctx, tmp);

        if (kflags & KEXTRA_TAILS_K_LOWER) {
            setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL);
            setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        }
        if (tailChunks > 1) {
            mulOpts.flags &= ~TILEMUL_NOT_INC_K;
            sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks);
            kgenBeginBranch(ctx, tmp);
        }

		addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);
        if (tailChunks > 1) {
            kgenEndBranch(ctx, NULL);
            mulOpts.flags |= TILEMUL_NOT_INC_K;
        }

        kgenEndBranch(ctx, NULL);
    }

    gset.tileA = tileSet.squareA;

    kgenAddStmt(ctx, "\n/*\n"
                     " * Stage 2. A part of work items multiply got result on "
                     "a respective\n"
                     " * inverted diagonal block, and the remaining ones wait. "
                     "Then they perform\n"
                     " * one step of further intermediate result evaluation as "
                     "multiplying tile by tile.\n"
                     " * It continues until the whole panel of the "
                     "matrix A is processed\n"
                     " */\n");

    // one must deal further with square blocks strictly
    gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y;

    sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)", subdims[0].y / subdims[1].y);
    kgenBeginBranch(ctx, tmp);

    if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
        sprintf(tmp, "const int bid = lid %% %u;\n\n",
                l1Pans);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Update the intermediate result multiply on the inverted diagonal tile,
     * and write back
     */
    genSetupCoords(ctx, &gset, TILE_UPDATE);

    sprintfStage2Condition(tmp, &gset, 0);
    ret = kgenBeginBranch(ctx, tmp);

    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);
    upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA;
    genUpdateIntermResult(ctx, &gset, false, upFlags);

    kgenAddBlankLine(ctx);

    lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) &&
                    (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 &&
                    !(tailM || tailN) &&
                    !(upFlags & UPRES_NO_VECTORIZATION) &&
                    !isComplexType(kextra->dtype));

    /*
     * it's needed now to adjust addressing mode of A so as to don't
     * exceed the bound of A
     */
    if (tailM) {
        setFetchAddrMode(mulOpts.fctx,
                         addrMode | FETCH_ADDR_A_CYCLICAL |
                         FETCH_ADDR_K_CYCLICAL);
        extraNew.flags |= KEXTRA_TAILS_K_LOWER;
    }

    genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts);
    gset.tileBX = tileSet.bStage2;
    if (tailM) {
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
    }

    kgenAddStmt(ctx, "// Write back the given result\n");

    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);

    if (lds_diagonal) {
       sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans);
    }

    genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags,
                                 NULL, NULL, lds_diagonal ? tmp : NULL);

    kgenEndBranch(ctx, NULL);   // multiply on the inverted tile path
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    // continue the tile update
    kgenAddBlankLine(ctx);
    sprintfStage2Condition(tmp, &gset, 1);
    kgenBeginBranch(ctx, tmp);
    genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
    if (lds_diagonal) {
        // TODO: add here storing to LDS as well
    }
    else {
		addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);
    }
    kgenEndBranch(ctx, NULL);           // tile update path
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    kgenEndBranch(ctx, NULL);           // second stage loop

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
        kgenAddStmt(ctx, tmp);
    }

    kgenEndBranch(ctx, NULL);           // loop over M

    ret = kgenEndFuncBody(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyFetchContext(mulOpts.fctx);
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#5
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    char tmp[4096], tmp1[4096];
    char *p;
    // is the iteration over N, N at the top level
    const char *typeName;
    char fpref;
    DataType dtype = kextra->dtype;
    ssize_t ret;
    BlasGenSettings gset;
    BlkMulOpts mulOpts;
    unsigned int tsize;
    unsigned int vecLen, outVecLen;
    bool b;
    const char *outTypeName;
    unsigned int i;
    unsigned int nrRegs, regPitch;
    int tra, trb;
    char vect[2] = {'y', 'x'};

    const char *coordConstants =
        "const uint workItemM = get_global_id(0) * %lu;\n"
        "const uint workItemN = get_global_id(1) * %lu;\n"
        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
        "uint vectK = (K + %u) / %u;\n";

    /*
     *  template for image based gemm preparation part
     *  for two dimensional work space
     */
    const char *localVariables =
        "uint k0;\n"
        "int2 coordA = (int2)(0, workItemM);\n"
        "int2 coordB = (int2)(0, workItemN);\n"
        "%s c[%u];\n\n";

    tsize = dtypeSize(dtype);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    if (isComplexType(dtype)) {
        regPitch = (unsigned int)subdims[1].x;
    }
    else {
        regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) *
                    sizeof(cl_float4) / tsize;
    }

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;
    initKernelVarNames(&gset.varNames, kextra->flags);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations and auxiliary functions
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    // now, generate the kernel

    sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
            typeName, typeName, typeName);
    kgenDeclareFunction(ctx, tmp);
    ret = kgenBeginFuncBody(ctx);

    // constants
    sprintf(tmp, coordConstants,
            subdims[1].y, subdims[1].x, subdims[1].y,
            vecLen - 1, vecLen);
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then declare local
     * variables
     */
    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);

    sprintf(tmp, localVariables, outTypeName, nrRegs);
    kgenAddStmt(ctx, tmp);

    // check if offset exceeds matrix
    kgenAddStmt(ctx, "if ((workItemM >= M) ||"
                         "(workItemN >= N)) {\n"
                     "    return;\n"
                     "}\n");

    kgenAddStmt(ctx, "C += offsetC;\n");

    // zero C block
    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
                 "    c[k0] = 0;\n"
                 "}\n\n",
            nrRegs);
    kgenAddStmt(ctx, tmp);

    // block multiplication inlined function
    sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)",
            subdims[1].bwidth / vecLen);
    kgenBeginBranch(ctx, tmp);

    mulOpts.aMobj = CLMEM_IMAGE;
    mulOpts.bMobj = CLMEM_IMAGE;
    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE;
    if (isComplexType(dtype)) {
        mulOpts.core = BLKMUL_SEPARATE_MULADD;
    }
    else {
        mulOpts.core = BLKMUL_MAD;
    }
    mulOpts.argNames.coordA = "coordA";
    mulOpts.argNames.coordB = "coordB";
    mulOpts.argNames.skewCol = "skewCol";
    mulOpts.argNames.skewRow = "skewRow";
    mulOpts.argNames.k = "k0";
    mulOpts.argNames.vectBoundK = "vectK";
    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
    if (ret) {
        destroyKgenContext(ctx);
        return -EOVERFLOW;
    }

    // update image coordinates
    sprintf(tmp, "\ncoordA.x += %lu;\n"
                 "coordB.x += %lu;\n",
            subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
    kgenAddStmt(ctx, tmp);

    kgenEndBranch(ctx, NULL);

    // reorder the given solution
    outVecLen = isComplexType(dtype) ? 1 : vecLen;
    p = tmp1;
    for (i = 0; i < regPitch / outVecLen; i++) {
        unsigned int k = (unsigned int)(subdims[1].y - 1) *
                         regPitch / outVecLen + i;

        sprintf(p,  "\n"
                    "    tmp = c[%u];\n"
                    "    for (j = %lu; j >= 0; j--) {\n"
                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                    "    }\n"
                    "    c[%u] = tmp;\n",
                k, subdims[1].y - 2, regPitch / outVecLen,
                i, regPitch / outVecLen, i, i);
        p += strlen(p);
    }
    sprintf(tmp, "\n"
                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
                 "    int j;\n"
                 "    %s tmp;\n"
                 "%s"
                 "}\n"
                 "\n",
                 outTypeName, tmp1);
    kgenAddStmt(ctx, tmp);

    tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B);
    sprintf(tmp, "coordA.%c = workItemM;\n"
                 "coordB.%c = workItemN;\n\n",
            vect[tra], vect[trb]);
    kgenAddStmt(ctx, tmp);

    // write back the tile evaluated
    generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#6
0
// Preparation function for images based kernel generator
static ssize_t
preparator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    char tmp[4096], conjStr[1024];
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CopyImgFuncs copyImgFuncs;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset;
    unsigned int vecLen;
    unsigned int tsize;
    const char *typeName;
    char fpref;
    bool b;
    size_t localBufSize;
    ssize_t ret;
    const char *conjCond;

    const char *functionHeadA =
        "int tra, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "uint m = (gid / bpr) * %lu;\n"
        "uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "A += offsetA;\n"
        "tra = (!transA && order == clblasColumnMajor) ||\n"
        "      (transA && order == clblasRowMajor);\n"
        "if (m >= M) {\n"
        "     return;\n"
        "}\n";

    const char *functionHeadB =
        "int trb, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "const uint n = (gid / bpr) * %lu;\n"
        "const uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "B += offsetB;\n"
        "trb = (!transB && order == clblasRowMajor) ||\n"
        "      (transB && order == clblasColumnMajor);\n"
        "if (n >= N) {\n"
        "    return;\n"
        "}\n";

    // Distribute blocks across compute units and copy matrix A to image.
    // Transposition and filling with zeros in unaligned cases is made using
    // buffer in local memory.
    const char *copyToImageA =
        "//copy matrix A block\n"
        "y = m + %u <= M ? %u : M - m;\n"
        "x = k + %u <= K ? %u : K - k;\n"
        "aligned = (x == %u) && (y == %u) && %d;\n"
        "int atcase = aligned * 10 + tra;\n"
        "%s" // conjugated check
        "if (atcase != 10) {\n"
        "    %s((__local float4*)temp);\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "}\n"
        "switch(atcase) {\n"
        "case 10: //aligned, not transposed\n"
        "    %s(imgA, k / %u, m, (GPtr)A, m, k, lda);\n"
        "    break;\n"
        "%s" // conjugated case
        "case 1: //not aligned, transposed\n"
        "    // generic transposed global to local\n"
        "    %s((LPtr)temp, (GPtr)A, k, m, x, y, %u, lda);\n"
        "    break;\n"
        "case 0: //not aligned, not transposed\n"
        "    // generic global to local\n"
        "    %s((LPtr) temp, (GPtr)A, m, k, y, x, %u, lda);\n"
        "    break;\n"
        "case 11: //aligned, transposed\n"
        "    // optimized transposed global to local\n"
        "    %s((LPtr) temp, (GPtr)A, k, m, lda);\n"
        "    break;\n"
        "}\n"
        "if (atcase != 10) {\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "    %s(imgA, k / %u, m, (LPtr) temp);\n"
        "}\n"
        "\n";

    const char *copyToImageB =
            "//copy matrix B block\n"
            "y = n + %u <= N ? %u : N - n;\n"
            "x = k + %u <= K ? %u : K - k;\n"
            "aligned = (x == %u) && (y == %u) && %d;\n"
            "int atcase = aligned * 10 + trb;\n"
            "%s" // conjugated check
            "if (atcase != 10) {\n"
            "    %s((__local float4*)temp);\n"
            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
            "}\n"
            "switch (atcase) {\n"
            "case 10: //aligned, not transposed\n"
            "    %s(imgB, k / %u, n, (GPtr)B, n, k, ldb);\n"
            "    break;\n"
            "%s" // conjugated case
            "case 1: //not aligned, transposed\n"
            "    // generic transposed global to local\n"
            "    %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
            "    break;\n"
            "case 0: //not aligned, not transposed\n"
            "    // generic global to local\n"
            "    %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
            "    break;\n"
            "case 11: //transposed, aligned\n"
            "    // optimized transposed global to local\n"
            "    %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
            "    break;\n"
            "}\n"
            "if (atcase != 10) {\n"
            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
            "    %s(imgB, k / %u, n, (LPtr)temp);\n"
            "}\n"
            "\n";

    memset(&copyImgFuncs, 0, sizeof(copyImgFuncs));
    memset(&gset, 0, sizeof(gset));

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    tsize = dtypeSize(dtype);

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    declareBlasEnums(ctx);

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;

    // generate necessary memory to image copying functions
    generateImageCopyFuncs(&copyImgFuncs, ctx, CLBLAS_GEMM, &gset);

    kgenAddBlankLine(ctx);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    if (kextra->kernType == CLBLAS_PREP_A_KERNEL) {
        sprintf(tmp, prepareImagesGemmDeclA, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        ret = kgenBeginFuncBody(ctx);

        // same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadA,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].y, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                    "(transA == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
                             "    %s((LPtr)temp, (GPtr)A, m, k, lda);\n"
                             "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_A]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageA,
                subdims[1].y, subdims[1].y, // y = m + dy <= M ?...
                subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].y, // aligned = (x==bw1)&&(y==dy1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_A) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_A],
                copyImgFuncs.globalToImage[MATRIX_A],
                vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_A],
                copyImgFuncs.localToImage[MATRIX_A],
                vecLen);
        kgenAddStmt(ctx, tmp);
    }
    else { // PREP_B
        sprintf(tmp, prepareImagesGemmDeclB, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        ret = kgenBeginFuncBody(ctx);

        // same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadB,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].x, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                    "(transB == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
                             "    %s((LPtr)temp, (GPtr)B, n, k, ldb);\n"
                             "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_B]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageB,
                subdims[1].x, subdims[1].x, // y = n + dy <= N ?...
                subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].x, // aligned = (x==bw1)&&(y==dx1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_B) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_B],
                copyImgFuncs.globalToImage[MATRIX_B],
                vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_B],
                copyImgFuncs.localToImage[MATRIX_B],
                vecLen);
        kgenAddStmt(ctx, tmp);
    }

    kgenEndFuncBody(ctx);

    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#7
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
                  ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
    char tmp[2048];
    const char *typeName;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset, tgset, lset, gset1;
    CLBLASKernExtra kextraTmp;
    TileMulOpts mulOpts, tmulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    size_t vecLen = kextra->vecLen;
    const char *outTypeName;
    bool b;
    TilePostFetchPrivate pfPriv;
    struct symvPrivate priv;
    size_t wgSize;
    bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
    bool tailK = (kflags & KEXTRA_TAILS_K) != 0;
    bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
    bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A);
    bool isComplex = isComplexType(dtype);
    Tile tileb;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    bool isHoriz = subdims[1].bwidth >= subdims[1].y;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth;
    unsigned int cLocal;
    unsigned int nPlans;

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    memset(&pfPriv, 0, sizeof(pfPriv));
    memset(&priv, 0, sizeof(priv));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);

    declareSymvKernel(ctx, dtype, pgran, kflags);

    ret = kgenBeginFuncBody(ctx);
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    kgenAddStmt(ctx, "#define M actualN\n");
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.flags |= BGF_WHOLE_A | BGF_UPTRS;

    gset.kextra = kextra;
    gset.pgran = pgran;

    initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET);
    gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows);

    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    cLocal = wgSize / bStep;
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n",
                 gid, cLocal, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint n = coordA;\n");
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddStmt(ctx, "actualN += startN;\n");

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "uint k = k0;\n");

    if (tailK) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, "if (n + Ktail < N)");
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        vnames->sizeM = "N";
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "N";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (rowMaj) {
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n");
    }
    sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)",
        subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    upper ^= rowMaj;
    tra ^= rowMaj;
    if (upper ^ rowMaj && tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    gset.tileA.trans ^= !upper;
    tgset = gset;
    tmulOpts = mulOpts;

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK)
    {
            kextraTmp = *kextra;
            gset1 = gset;

            kextraTmp.vecLen = 1;
            gset1.kextra = &kextraTmp;

            gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1;

            gset1.tileBX.nrRows = 1;
            gset1.tileA.nrCols = 1;
            kextraTmp.vecLenA = 1;
    }


    if (isHoriz)
    {
        lset = gset;
        lset.subdims[0].bwidth = lset.subdims[1].bwidth =
            lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y);
        lset.tileA.nrCols = lset.tileA.nrRows =
            lset.tileBX.nrRows = lset.subdims[1].y;

        kgenAddStmt(ctx, "// the diagonal\n");
        kgenBeginBranch(ctx, "if (k <= n)");
        kgenAddStmt(ctx, "uint k1 = k;\n");

        if (subdims[1].bwidth != subdims[1].y) {
            kgenAddStmt(ctx, "// the pred diagonal\n");
            sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */
        }

        initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth,
            lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET,
            lset.tileA.trans, lset.tileA.packed);
        declareOneTileStorage(ctx, &tileb);

        genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                mulOpts.flags, kflags);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.tilea = lset.tileA;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        lset.tileA = tileb;
        mulOpts.postFetch = genPostFetchMirror;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &lset, &mulOpts);
        if (ret != 0) {
            return ret;
        }

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans = lset.tileA.trans ^= true;
        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        if (subdims[1].bwidth != subdims[1].y) {
            size_t width = umax(subdims[1].bwidth, subdims[1].y);
            kgenAddStmt(ctx, "// the post diagonal\n");
            if (tailK) {
                kgenBeginBranch(ctx, "if(k < N)");
            }
            sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)",
                    lset.subdims[1].bwidth,
                    width, width, width,
                    lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */

            if (tailK) {
                kgenEndBranch(ctx, NULL);
                kgenBeginBranch(ctx, "else");
                /* Handle tail along vector X */

                kgenAddStmt(ctx, "N += Ntail;\n");

                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
                sprintf(tmp, "for (k += %lu; k < actualN; k++)",
                    lset.subdims[1].bwidth);
                kgenBeginBranch(ctx, tmp);

                gset1.tileA.trans = gset.tileA.trans;

                genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset1, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
                kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
                mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
                genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
#endif

                mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A;
                kgenEndBranch(ctx, NULL);
            }
        }

        sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);
    }
    else
    {

        kgenAddStmt(ctx, "// the diagonal\n");
        sprintf(tmp, "if (k <= (n  + (get_local_id(0)%%%lu)*%lu))",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

        kgenBeginBranch(ctx, NULL);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &gset;
        mulOpts.postFetch = genPostFetchVertDiag;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans ^= true;
        lset = gset;

        sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, NULL);

        priv.diag = false;
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL); /* if */
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n");
    }
    sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");

        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
        sprintf(tmp, "for (; k < N; k++)");
        kgenBeginBranch(ctx, tmp);

        gset1.tileA.trans = gset.tileA.trans;

        genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset1, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
        mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
#endif

        kgenEndBranch(ctx, NULL);

        kgenBeginBranch(ctx, "else");

        sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A;
        genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        pfPriv.gset = &tgset;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &tgset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        if (isHoriz) {
            sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth);
        }
        else {
            sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n",
                subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        }
        kgenAddStmt(ctx, tmp);

        kgenEndBranch(ctx, NULL); /* k loop */

        kgenBeginBranch(ctx, "if (k < N)");
        if (isHoriz) {
            kgenAddStmt(ctx, "k = n;\n");
        }
        else {
            sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
                subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
            kgenAddStmt(ctx, tmp);
        }

        genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        if (!isHoriz) {
            if (upper ^ rowMaj && tra) {
                tmulOpts.flags &= ~TILEMUL_TRA;
            }
            else {
                tmulOpts.flags |= TILEMUL_TRA;
            }
            kgenAddStmt(ctx, "Ktail = N - n;\n");
            priv.coord = true;
        }
        else {
            priv.coord = false;
        }
        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K;


        ret = tileMulGen(ctx, &lset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        kgenEndBranch(ctx, NULL);

        kgenEndBranch(ctx, NULL);
    }


    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    sprintf(tmp,"(%s - startN)", vnames->coordA);
    setResultPos(ctx, kflags, tmp);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#8
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    //yes, KEXTRA_TAILS_K because it is set if N % bw != 0
    bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    char tmp[4096];
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    TilePostFetchPrivate pfPriv;
    unsigned int vecLen = kextra->vecLen;
    const char *outTypeName;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    const char *typeName;
    size_t wgSize;
    //unsigned int nStep = 32;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
    unsigned int cLocal;
    bool isComplex = isComplexType(dtype);
    unsigned int nPlans;

    typeName = dtypeBuiltinType(dtype);
    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // now, generate the kernel
    declareGemvKernel(ctx, dtype, pgran, kflags);
    ret = kgenBeginFuncBody(ctx);
    kgenAddStmt(ctx, "// M always denotes length of Y "
                     "and N denotes length of X in the kernel\n");
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);
    cLocal = wgSize/bStep;

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;

    gset.pgran = pgran;
    gset.kextra = kextra;
    gset.flags = BGF_UPTRS;

    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
                 gid, bStep, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < M && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    if (tailN) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "M";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    if (staggered) {
        vnames->k = "k1";
        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered);
        kgenAddStmt(ctx, tmp);
    }

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    vnames->k = "k";
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailN) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        kgenBeginBranch(ctx, "if (k < N)");

        mulOpts.flags |= TILEMUL_SKEW_B;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL;
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = subdims[1].bwidth;
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    setResultPos(ctx, kflags, vnames->coordA);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#9
0
int main(int argc, char *argv[])
{
    char out[1024*1024];
    CLBLASKernExtra kextra;
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    int i;
    cl_uint blockM = 4, blockN = 4, blockK = 8;
    struct KgenContext *ctx = createKgenContext(out, sizeof(out), 1);
    FType alpha;
    cl_int err;
    unsigned int iterNum = 1;
    const char* const shortOptions = "hd:f:l:t:a:b:s:g:i:c:ov";
    const struct option longOptions[] = {
            {"help", no_argument, NULL, 'h'},
            {"device", required_argument, NULL, 'd'},
            {"fetch", required_argument, NULL, 'f'},
            {"local", required_argument, NULL, 'l'},
            {"type", required_argument, NULL, 't'},
            {"a", required_argument, NULL, 'a'},
            {"b", required_argument, NULL, 'b'},
            {"skew", required_argument, NULL, 's'},
            {"globalcycling", required_argument, NULL, 'g'},
            {"iter", required_argument, NULL, 'i'},
            {"core", required_argument, NULL, 'c'},
            {"old", no_argument, NULL, 'o'},
            {"verbose", no_argument, NULL, 'v'},
            {NULL, 0, NULL, 0}
    };
    int nextOption;
    cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
    bool verbose = false;
    SubproblemDim *subdims = gset.subdims;
    bool separateFetch = false;

    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    memset(&kextra, 0, sizeof(kextra));
    gset.kextra = &kextra;
    gset.flags |= BGF_WHOLE_A;
    mulOpts.core = TILEMUL_MAD;
    mulOpts.flags = TILEMUL_FORCE_VECTORIZATION;
    kextra.vecLen = 1;
    kextra.dtype = TYPE_FLOAT;

    alpha.f = 1;

    // parse command line
    do {
        nextOption = getopt_long(argc, argv, shortOptions, longOptions, NULL);
        switch (nextOption) {
        case 'h':
            printUsage(argv[0], EXIT_SUCCESS);
            break;
        case 'd':
            if (!strcmp("cpu", optarg)) {
                deviceType = CL_DEVICE_TYPE_CPU;
            }
            else if (!strcmp("gpu", optarg)) {
                deviceType = CL_DEVICE_TYPE_GPU;
            }
            else {
                printf("Unknown device type %s. Supported values are \"cpu\" "
                        "and \"gpu\".\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 'f':
            kextra.vecLen = atoi(optarg);
            break;
        case 'l':
            if (!strcmp(optarg, "A")) {
                mulOpts.memA = CLMEM_LOCAL_MEMORY;
            }
            else if (!strcmp(optarg, "B")) {
                mulOpts.memB = CLMEM_LOCAL_MEMORY;
            }
            else {
                printf("Wrong matrix specified: %s. Supported values are "
                        "A, B.\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 't':
            if (!strcmp(optarg, "s")) {
                kextra.dtype = TYPE_FLOAT;
                alpha.f = 1;
            }
            else if (!strcmp(optarg, "d")) {
                kextra.dtype = TYPE_DOUBLE;
                alpha.d = 1;
            }
            else if (!strcmp(optarg, "c")) {
                kextra.dtype = TYPE_COMPLEX_FLOAT;
                alpha.f2.s[0] = 1;
                alpha.f2.s[1] = 0;
            }
            else if (!strcmp(optarg, "z")) {
                kextra.dtype = TYPE_COMPLEX_DOUBLE;
                alpha.d2.s[0] = 1;
                alpha.d2.s[1] = 0;
            }
            else {
                printf("Wrong type specified: %s. Supported values are "
                        "s, d, c, z.\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 'a':
            if (!strcmp(optarg, "r")) {
                mulOpts.flags &= ~TILEMUL_TRA;
            }
            else if (!strcmp(optarg, "c")) {
                mulOpts.flags |= TILEMUL_TRA;
            }
            else {
                printf("Wrong tile a parameter specified: %s. Supported values "
                        "are \"r\", \"c\".\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 'b':
            if (!strcmp(optarg, "r")) {
                mulOpts.flags &= ~TILEMUL_TRB;
            }
            else if (!strcmp(optarg, "c")) {
                mulOpts.flags |= TILEMUL_TRB;
            }
            else {
                printf("Wrong tile b order specified: %s. Supported values "
                        "are \"r\", \"c\".\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 's':
            if (!strcmp(optarg, "a")) {
                mulOpts.flags |= TILEMUL_SKEW_A;
            }
            else if (!strcmp(optarg, "b")) {
                mulOpts.flags |= TILEMUL_SKEW_B;
            }
            else if (!strcmp(optarg, "k")) {
                mulOpts.flags |= TILEMUL_SKEW_K;
            }
            else {
                printf("Wrong skew parameter specified: %s. Supported values "
                        "are \"a\", \"b\", \"k\"\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 'g':
            if (!strcmp(optarg, "a")) {
                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
            }
            else if (!strcmp(optarg, "b")) {
                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B;
            }
            else if (!strcmp(optarg, "k")) {
                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
            }
            else {
                printf("Wrong global cycling parameter specified: %s. "
                        "Supported values are \"a\", \"b\", \"k\"\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 'i':
            iterNum = atoi(optarg);
            break;
        case 'c':
            if (!strcmp("muladd", optarg)) {
                mulOpts.core = TILEMUL_MULADD;
            }
            else if (!strcmp("mad", optarg)) {
                mulOpts.core = TILEMUL_MAD;
            }
            else if (!strcmp("dot", optarg)) {
                mulOpts.core = TILEMUL_DOT;
            }
            else {
                printf("Unknown multiplier core %s. Supported values"
                        " are \"muladd\", \"mad\" and \"dot\".\n", optarg);
                exit(EXIT_FAILURE);
            }
            break;
        case 'o':
            separateFetch = false;
            break;
        case 'v':
            verbose = true;
            break;
        case -1:
            break;
        default:
            printUsage(argv[0], EXIT_FAILURE);
            break;
        }
    } while (nextOption != -1);

    if (optind + 2 >= argc) {
        printf("Error: Not all sizes are specified\n");
        printUsage(argv[0], EXIT_FAILURE);
    }
    blockM = atoi(argv[optind]);
    blockN = atoi(argv[optind + 1]);
    blockK = atoi(argv[optind + 2]);

    if ((mulOpts.memA == CLMEM_LOCAL_MEMORY ||
            mulOpts.memB == CLMEM_LOCAL_MEMORY) &&
            ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) {
        printf("One of matrixes is in local memory, "
                "disabling global cycling\n");
        mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC;
    }

    if (mulOpts.flags & TILEMUL_TRA) {
        kextra.flags |= KEXTRA_TRANS_A;
    }
    if (mulOpts.flags & TILEMUL_TRB) {
        kextra.flags |= KEXTRA_TRANS_B;
    }

    subdims[0].y = blockM * ITEM_WORK_M;
    subdims[0].x = blockN * ITEM_WORK_N;
    subdims[0].bwidth = blockK * ITEM_BLOCKS_K;
    subdims[1].y = blockM;
    subdims[1].x = blockN;
    subdims[1].bwidth = blockK;

    memset(out, 0, sizeof(out));

    i = isDoubleBasedType(kextra.dtype);
    kgenDeclareUptrs(ctx, i);
    genTest(ctx, &gset, &mulOpts, separateFetch);
    destroyKgenContext(ctx);

    printf("Kernel code: \n\"%s\"\n", out);
    err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha,
              &gset, mulOpts.flags, deviceType, verbose, iterNum);
    if (err != CL_SUCCESS) {
        printf("Test run failed, error %d\n", err);
        return EXIT_FAILURE;
    }
	return EXIT_SUCCESS;
}
示例#10
0
cl_int
run (
        const char *ker,
        cl_uint M,
        cl_uint N,
        cl_uint K,
        FType alpha,
        BlasGenSettings *gset,
        TileMulFlags flags,
        cl_device_type deviceType,
        bool verbose,
        unsigned int iterNum)
{
    cl_int err;
    cl_platform_id platform;
    cl_context ctx;
    cl_device_id device;
    cl_command_queue queue;
    cl_event evt;
    DataType dtype = gset->kextra->dtype;

    cl_mem bufA, bufB, bufC;
    FPtr A, B, C, C_naive;
    bool isComplex = isComplexType(dtype);
    bool isDouble = isDoubleBasedType(dtype);
    cl_uint nwords = (isComplex) ? 2 : 1;
    unsigned int tsize = dtypeSize(dtype);
    cl_kernel kernel;
    size_t i, j, k;
    size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    char log[100000];
    size_t logSize;
    cl_long sTime, fTime;
    cl_program program = NULL;

    clGetPlatformIDs(1, &platform, NULL);

    clGetDeviceIDs(platform, deviceType, 1, &device, NULL);

    ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        return err;
    }

    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (err != CL_SUCCESS) {
        return err;
    }

    /* Prepare OpenCL kernel and its arguments */

    program = clCreateProgramWithSource(ctx, 1, &ker, NULL, NULL);

    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    clGetProgramBuildInfo (program,
            device,
            CL_PROGRAM_BUILD_LOG,
            sizeof(log),
            log,
            &logSize);
    printf("%s", log);
    if (err != CL_SUCCESS){
        clReleaseProgram(program);
        return err;
    }

    kernel = clCreateKernel(program, kernelName, &err);
    if (err != CL_SUCCESS){
        clReleaseProgram(program);
        return err;
    }
    /* Memory allocation */

    A.v = malloc(M * K * tsize);
    B.v = malloc(K * N * tsize);
    C.v = malloc(M * N * tsize);
    C_naive.v = malloc(M * N * tsize);

#if JUST_MULTIPLICATION
    srand(0);
    if (isDouble) {
        for(i = 0; i < M * K * nwords; i++){
            A.d[i] = i;
        }
        for(i = 0; i < N * K * nwords; i++){
            B.d[i] = i + 7;
        }
        for(i = 0; i < M * N * nwords; i++){
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for(i = 0; i < M * K * nwords; i++){
            A.f[i] = i;
        }
        for(i = 0; i < N * K * nwords; i++){
            B.f[i] = i + 7;
        }
        for(i = 0; i < M * N * nwords; i++){
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }

#else
    srand(0);
    if (isDouble) {
        for(i = 0; i < M * K * nwords; i++){
            A.d[i] = (double)(rand() % RAND_BOUND);
        }
        for(i = 0; i < N * K * nwords; i++){
            B.d[i] = (double)(rand() % RAND_BOUND);
        }
        for(i = 0; i < M * N * nwords; i++){
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for(i = 0; i < M * K * nwords; i++){
            A.f[i] = (float)(rand() % RAND_BOUND);
        }
        for(i = 0; i < N * K * nwords; i++){
            B.f[i] = (float)(rand() % RAND_BOUND);
        }
        for(i = 0; i < M * N * nwords; i++){
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }
#endif

    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
            K * M * tsize, A.v, &err);
    if (err != CL_SUCCESS) {
        clReleaseKernel(kernel);
        return err;
    }

    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
            K * N * tsize, B.v, &err);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
        M * N * tsize, C.v, &err);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    /* Argument setting and kernel execution */
    err = clSetKernelArg(kernel, 0, tsize, alpha.u);
    err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA);
    err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB);
    err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
    err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
    err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
    err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
    err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufC);
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
        globalWorkSize, localWorkSize, 0,
        NULL, &evt);

    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufC);
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    err = clFinish(queue);
    err = clEnqueueReadBuffer (queue,
        bufC,
        CL_TRUE,
        0,
        M * N * tsize,
        C.v,
        0,
        NULL,
        NULL);

    /* Naive CPU multiplication */
    if (isDouble) {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_double2 val;
                    for (k = 0; k < K; k++) {
                        cl_double2 bkj = flags & TILEMUL_TRB ?
                                B.d2[j * K + k] : B.d2[k * N + j];
                        cl_double2 aik = flags & TILEMUL_TRA ?
                                A.d2[k * M + i] : A.d2[i * K + k];
                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.d2[i * N + j].s[0] += val.s[0];
                        C_naive.d2[i * N + j].s[1] += val.s[1];
                    }
                    val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] -
                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[1];
                    val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] +
                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[0];
                    C_naive.d2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        double bkj = flags & TILEMUL_TRB ?
                                B.d[j * K + k] : B.d[k * N + j];
                        double aik = flags & TILEMUL_TRA ?
                                A.d[k * M + i] : A.d[i * K + k];
                        C_naive.d[i * N + j] += aik * bkj;
                    }
                    C_naive.d[i * N + j] *= alpha.d;
                }
            }
        }

        for (i = 0; i < M * N; i++) {
            if (C.d[i] != C_naive.d[i]) {
                printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N,
                        C.d[i], C_naive.d[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }
    else {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_float2 val;
                    for (k = 0; k < K; k++) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                                B.f2[j * K + k] : B.f2[k * N + j];
                        cl_float2 aik = flags & TILEMUL_TRA ?
                                A.f2[k * M + i] : A.f2[i * K + k];
                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.f2[i * N + j].s[0] += val.s[0];
                        C_naive.f2[i * N + j].s[1] += val.s[1];
                    }
                    val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] -
                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[1];
                    val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] +
                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[0];
                    C_naive.f2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        float bkj = flags & TILEMUL_TRB ?
                                B.f[j * K + k] : B.f[k * N + j];
                        float aik = flags & TILEMUL_TRA ?
                                A.f[k * M + i] : A.f[i * K + k];
                        C_naive.f[i * N + j] += aik * bkj;
                    }
                    C_naive.f[i * N + j] *= alpha.f;
                }
            }
        }

        for (i = 0; i < M * N; i++) {
            if (C.f[i] != C_naive.f[i]) {
                printf("Differ at (%lu, %lu): %lf != %lf\n",
                        i / N, i % N, C.f[i], C_naive.f[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }

    /* End of naive CPU multiplication */
    if (verbose) {
        if (!isDouble) {
            printf("Matrix A:\n");
            for (i = 0; i < M; i++) {
                for (k = 0; k < K; k++) {
                    if (isComplex) {
                        cl_float2 aik = flags & TILEMUL_TRA ?
                                A.f2[k * M + i] : A.f2[i * K + k];
                        printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]);
                    }
                    else {
                        float aik = flags & TILEMUL_TRA ?
                                A.f[k * M + i] : A.f[i * K + k];
                        printf("%4.1f ", aik);
                    }
                }
                printf("\n");
            }

            printf("Matrix B:\n");
            for (k = 0; k < K; k++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                                B.f2[j * K + k] : B.f2[k * N + j];
                        printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]);
                    }
                    else {
                        float bkj = flags & TILEMUL_TRB ?
                                B.f[j * K + k] : B.f[k * N + j];
                        printf("%4.1f ", bkj);
                    }
                }
                printf("\n");
            }

            printf("CPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                                C_naive.f2[i * N + j].s[0],
                                C_naive.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C_naive.f[i * N + j]);
                    }
                }
                printf("\n");
            }

            printf("GPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                                C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C.f[i * N + j]);
                    }
                }
                printf("\n");
            }
        }
    }

    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
            &sTime, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
            &fTime, NULL);

    printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
            (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iterNum));

    clReleaseMemObject(bufC);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufA);
    clReleaseKernel(kernel);
    return CL_SUCCESS;
}
示例#11
0
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    char tmp[4096];
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    int ret;
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    int tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A);
    int trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B);
    unsigned int l1Pans;
    TilePostFetchPrivate pfPriv[2];
    UpdateResultFlags upResFlags;
    TailStatus tailStatus;
    bool subgMode = false;
    SubgVarNames subgVNames;

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // mismatching subdims define case with subgroup decomposition
    subgMode = ( subdims[0].bwidth != subdims[1].bwidth );

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.flags = BGF_DISTINCT_VECLEN;

    gset.flags |= BGF_WHOLE_A;

    /*FIXME: This used to be a workaround for compilation issues with dtrmm on
     * cpu. Normally BGF_WHOLE_A should be enabled always. But for now,
     * there are wrong results for non-aligned cases on CPU and there is
     * no workaround yet.
    if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K)) {
        gset.flags &= ~BGF_WHOLE_A;
    }*/
    gset.kextra = kextra;
    gset.pgran = pgran;
    //avoid [0].bw loop
    //gset.subdims[0].bwidth = gset.subdims[1].bwidth;

    memset(pfPriv, 0, sizeof(pfPriv));
    pfPriv[0].funcID = CLBLAS_TRMM;
    pfPriv[0].gset = &gset;
    if ((gset.flags & BGF_WHOLE_A) != 0) {
        pfPriv[0].wholeA = 1;
    }

    // at first, generate needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // For inner callback, because both callbacks use own fetchNumA
    memcpy(&pfPriv[1], &pfPriv[0], sizeof(pfPriv[0]));

    // if both matrices are accessed row-major - using subgroup pattern
    if ( subgMode ) {

        declareTrxmKernel(ctx,
            dtype,
            pgran,
            kflags,
            CLBLAS_TRMM,
            "Subgroup",
            true,
            true);
        gset.flags |= BGF_UPTRS;
    }
    else {

        declareTrxmKernel(ctx,
            dtype,
            pgran,
            kflags,
            CLBLAS_TRMM,
            "Block",
            true,
            true);

    }
    kgenBeginFuncBody(ctx);

    initDefaultTiles(&gset, CLBLAS_TRMM, 0, PRIV_STORAGE_VARIABLE_SET);
    declareTileStorages(ctx, &gset);

    kgenAddStmt(ctx,
                "uint currM, currN;\n"
                "uint4 coord = 0; /* contains coordB, coordA, k */\n");

    kgenDeclareLocalID(ctx, "lid", pgran);
    kgenDeclareGroupID(ctx, "gid", pgran);

    if ( subgMode ) {

        gset.varNames.LDS = "scratch";

        // declaring variables used by subgroup mode
        subgVNames.itemId = "itemId";
        subgVNames.subgCoord = "subgCoord";

        kgenAddBlankLine( ctx );
        kgenAddBlankLine(ctx);

        kgenPrintf(ctx, "int2 %s;\n", subgVNames.itemId );
        kgenPrintf(ctx, "int2 %s;\n", subgVNames.subgCoord);

        // item ID
        kgenPrintf( ctx,
            "%s.x = get_local_id(0)%%%d;\n",
            subgVNames.itemId,
            subdims[0].bwidth/subdims[1].bwidth);

        // subgroup ID
        kgenPrintf( ctx,
            "%s.y = get_local_id(0)/%d;\n",
            subgVNames.itemId,
            subdims[0].bwidth/subdims[1].bwidth);

        // subgroup coordX
        kgenPrintf( ctx,
            "%s.x = %s.y/%d;\n",
            subgVNames.subgCoord,
            subgVNames.itemId,
            subdims[0].y/subdims[1].y );

        // subgroup coordY
        kgenPrintf( ctx,
            "%s.y = %s.y%%%d;\n",
            subgVNames.subgCoord,
            subgVNames.itemId,
            subdims[0].y/subdims[1].y );
    }

    kgenAddBlankLine(ctx);

    sprintf(tmp, "currN = gid * %lu;\n", subdims->x);
    kgenAddStmt(ctx, tmp);
    genInitCurrM(ctx, subdims, kflags);

    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        kgenAddStmt(ctx, "A += offA;\n");
    }
    genTrxmBMatrShift(ctx, kflags, true);

    if ( subgMode ) {
        kgenAddStmt(ctx,
            "GPtr Ag = {A};\n"
            "GPtr Bg = {B};\n");
    }

    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;

    memset(&mulOpts, 0, sizeof(mulOpts));
    mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0)
            ? TILEMUL_MAD
            : TILEMUL_MULADD;
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
    mulOpts.postFetch = NULL;
    mulOpts.postFetchPriv = &pfPriv;
    mulOpts.flags = TILEMUL_NO_FLAGS;
    mulOpts.flags |= TILEMUL_EXTERN_RDECL;

    if ( subgMode ) {

        mulOpts.flags |= TILEMUL_NOT_INC_K;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (kflags & KEXTRA_TAILS_M_LOWER) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }
    if (kflags & KEXTRA_TAILS_N_LOWER) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B;
    }
    if (kflags & KEXTRA_TAILS_K_LOWER) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
        mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL;
    }

    if (tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if (!trb) {
        mulOpts.flags |= TILEMUL_TRB;
    }
    if (isMatrixConj(kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixConj(kflags, MATRIX_B)) {
        mulOpts.flags |= TILEMUL_CONJB;
    }

    initKernelVarNames(&gset.varNames);

    if ( subgMode ) {

        kgenPrintf( ctx,
            "coord.x = currN + %s.x*%d;\n",
            subgVNames.subgCoord,
            subdims[1].x );
    }
    else {

        sprintf(tmp, "coord.x = currN + lid %% %u * %lu;\n", l1Pans, subdims[1].x);
        kgenAddStmt(ctx, tmp);
    }

    // loop over M
    sprintf(tmp, "for (uint m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
    kgenBeginBranch(ctx, tmp);

    genStartPosK( ctx, subdims, kflags, subgMode );

    sprintf(tmp, "coord.z = kBegin;\n");
    kgenAddStmt(ctx, tmp);

    if ( subgMode ) {

        kgenPrintf(ctx,
            "coord.y = currM + %s.y*%d;\n",
            subgVNames.subgCoord,
            subdims[1].y);
    }
    else {

        sprintf( tmp,
            "coord.y = currM + lid / %u * %lu;\n",
            l1Pans,
            subdims[1].y );
        kgenAddStmt(ctx, tmp);
    }

    genZeroTile(ctx, &gset.tileCY);

    checkGenBeginHitMatrixBlock(ctx, kflags);
    tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRMM, &gset, NULL);

    // loops along 'K'
    if ( subgMode ) {
        ret = genSubgLoopsK( ctx, &gset, &mulOpts, &subgVNames, staggered);
    }
    else {
        ret = genLoopsK( ctx, &gset, &mulOpts, tmp );
    }

    if (ret != 0) {
        printf("%s", buf);
        return ret;
    }

    checkGenEndHitMatrixBlock(ctx, kflags);
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    // store results
    // for result update - x coordinate is in elements, not in vectors

    checkGenRestoreTailCoords(ctx, &gset, tailStatus);
    upResFlags = kextraToUpresFlags(CLBLAS_TRMM, kflags);
    upResFlags |= tailStatusToUpresFlags(tailStatus);
    upResFlags |= UPRES_INDEXING_WITH_CONSTANTS;
    upResFlags |= UPRES_TRIANG_WRITE_C;
    upResFlags |= UPRES_EXCEED_PROBLEM_CONDITION;

    if ( subgMode ) {

        mergeUpdateResult( ctx,
            CLBLAS_TRMM,
            &gset,
            &subgVNames,
            upResFlags,
            genResultUpdateWithFlags );
    }
    else {

        //checkGenBeginHitMatrixBlock(ctx, kflags);
        genResultUpdateWithFlags( ctx,
            CLBLAS_TRMM,
            &gset,
            upResFlags,
            NULL,
            NULL,
            NULL );
        //checkGenEndHitMatrixBlock(ctx, kflags);
    }

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "currM += %lu;\n", subdims[0].y);
    }
    else {
        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
    }
    kgenAddStmt(ctx, tmp);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}