Example #1
0
int
declareOneTileStorage(struct KgenContext *ctx, const Tile *tile)
{
    char tmp[1024];
    const char *tname;
    int r;
    size_t size;

    getVectorTypeName(tile->dtype, tile->vecLen, &tname, NULL);
    size = tileVectorsNum(tile);
    if (tile->storType == PRIV_STORAGE_ARRAY) {
        sprintf(tmp, "%s %s[%lu];\n", tname, tile->baseName, size);
    }
    else {
        size_t i;
        char *p;

        sprintf(tmp, "%s %s0", tname, tile->baseName);
        p = tmp + strlen(tmp);
        for (i = 1; i < size; i++) {
            sprintf(p, ", %s%lu", tile->baseName, i);
            p += strlen(p);
        }
        strcpy(p, ";\n");
    }

    r = kgenAddStmt(ctx, tmp);

    return (r) ? -EOVERFLOW : 0;
}
Example #2
0
static void
genPreloadedTileMul(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    const Tile *parTile,
    const char* copy2LDSFuncName)
{
    char tmp[1024];
    KernelExtraFlags kflags = gset->kextra->flags;
    unsigned int bwidthOld;
    const char *oldNameB;
    const char *ptrName;

    getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName);
    kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);

    if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
        sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n",
            copy2LDSFuncName, gset->subdims[0].x);
    }
    else {
        sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n",
            copy2LDSFuncName, gset->subdims[0].x);
    }
    kgenAddStmt(ctx, tmp);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "lB = lBMain;\n\n");

    mulOpts->memB = CLMEM_LOCAL_MEMORY;
    oldNameB = gset->varNames.B;
    bwidthOld = (unsigned int)gset->subdims[0].bwidth;
    gset->varNames.B = "lB";
    gset->subdims[0].bwidth = (parTile->trans) ? parTile->nrRows :
                                                 parTile->nrCols;

    tileMulGen(ctx, gset, mulOpts);

    gset->varNames.B = oldNameB;
    gset->subdims[0].bwidth = bwidthOld;
    mulOpts->memB = CLMEM_GLOBAL_MEMORY;
}
Example #3
0
static void
genPointerUpdate(
    struct KgenContext *ctx,
    const char *ptrName,
    const char *ldName,
    size_t bwidth,
    size_t bheight,
    unsigned int vecLen,
    DataType dtype,
    BlasGenFlags gflags,
    bool rowMaj,
    bool isLocal)
{
    const char *uptr;
    Kstring tmp;
    const char *p;

    if (gflags & BGF_UPTRS) {
        getVectorTypeName(dtype, vecLen, NULL, &uptr);
        ksprintf(&tmp, "%s.%s", ptrName, uptr);
        p = tmp.buf;
    }
    else {
        p = ptrName;
    }

    if (rowMaj) {
        kgenPrintf(ctx, "%s += %lu;\n", p, bwidth / vecLen);
    }
    else if (isLocal) {
        kgenPrintf(ctx, "%s += %lu;\n",
                   p, bwidth * (bheight / vecLen));
    }
    else {
        Kstring ld;
        Kstring bwStr, madExpr;
        unsigned int scale;

        kstrcpy(&ld, ldName);
        ksprintf(&bwStr, "%lu", bwidth);
        scale = (gflags & BGF_LD_IN_VECTORS) ? 0 : vecLen;
        sprintfFastScalarMad(&madExpr, &bwStr, &ld, scale, NULL);
        kgenPrintf(ctx, "%s += %s;\n", p, madExpr.buf);
    }
}
Example #4
0
int
genMulTiles(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    const TileMulOpts *mulOpts)
{
    char s[32];
    const CLBLASKernExtra *kextra = gset->kextra;
    const char *tNameIn;
    unsigned int i;
    unsigned int iend;
    bool tra = ((mulOpts->flags & TILEMUL_TRA) != 0);
    bool trb = ((mulOpts->flags & TILEMUL_TRB) != 0);
    TileMulCore core;
    int ret;

    ret = checkInput(gset, mulOpts);
    if (ret) {
        return ret;
    }

    getVectorTypeName(kextra->dtype, kextra->vecLen, &tNameIn, NULL);
    core = checkReplaceCore(gset, mulOpts->core, tra, trb);

    if (((core == TILEMUL_MULADD || isComplexType(kextra->dtype)) &&
          !tra && trb)) {
        sprintf(s,"%s sum;\n", tNameIn);
        kgenAddStmt(ctx, s);
    }

    iend = (unsigned int)((mulOpts->flags & TILEMUL_TRA) ?
                            gset->subdims[1].bwidth : gset->subdims[1].y);
    for (i = 0; i < iend; i++) {
        genMulLineOnTile(ctx, gset, mulOpts, i, true);
    }

    // just to get state
    ret = kgenAddStmt(ctx, NULL);

    return (ret) ? -EOVERFLOW : 0;
}
Example #5
0
/*
 * Generate cyclical tile shifting so as to convert the skewed
 * storing to "one-to-one", i. e. the first element in the tile
 * matches to the first element of the respective tile in the
 * output matrix.
 */
static void
genTileCyclicalShift(struct KgenContext *ctx, BlasGenSettings *gset)
{
    const char *tname;
    Kstring k1, k2, *src, *dst, *ktmp;
    unsigned int row, col;
    unsigned int seglen;
    Tile *tileC = &gset->tileCY;

    seglen = tileLineSegmentLen(tileC);
    getVectorTypeName(gset->kextra->dtype, seglen, &tname, NULL);

    kgenAddStmt(ctx, "\n// deliver from skewing in the result\n");
    kgenBeginBranch(ctx, "for (uint i = 0; i < skewX; i++)");
    kgenPrintf(ctx, "%s tmp;\n\n", tname);

    src = &k1;
    dst = &k2;

    // Skewing may be used only in case of transposed C
    for (row = 0; row < tileC->nrRows; row += seglen) {
        sprintfTileElement(dst, tileC, row, tileC->nrCols - 1, seglen);
        kgenPrintf(ctx, "tmp = %s;\n", dst->buf);
        for (col = tileC->nrCols - 1; col > 0; col--) {
            sprintfTileElement(src, tileC, row, col - 1, seglen);
            kgenPrintf(ctx, "%s = %s;\n", dst->buf, src->buf);
            // swap pointer
            ktmp = src;
            src = dst;
            dst = ktmp;
        }
        kgenPrintf(ctx, "%s = tmp;\n", dst->buf);
    }

    kgenEndBranch(ctx, NULL);
    kgenAddBlankLine(ctx);
}
Example #6
0
int
tileMulGen(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    const TileMulOpts *mulOpts)
{
    char s[MAX_LENGTH];
    unsigned int vlenA, vlenB;
    unsigned int i, iend; //counters
    // size_t m, n, subK;
    int ret = 0;
    TileMulFlags mflags = mulOpts->flags;
    bool tra = ((mflags & TILEMUL_TRA) != 0);
    bool trb = ((mflags & TILEMUL_TRB) != 0);
    bool localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
    bool localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
    bool internalFetchB = ((mflags & TILEMUL_NOT_FETCH_B) == 0);
    bool bwStride = ((mflags & TILEMUL_BW_STRIDE) != 0);
    bool incK = ((mflags & TILEMUL_NOT_INC_K) == 0);
    const SubproblemDim *subdims = gset->subdims;
    size_t bwidth = bwStride ? subdims[0].bwidth : subdims[1].bwidth;
    TileMulCore core = mulOpts->core;
    DataType dtype = gset->kextra->dtype;
    const KernelVarNames *varNames = &gset->varNames;
    FetchOpts fetchOpts;
    struct FetchContext *fctx = mulOpts->fctx;
    FetchAddrMode addrMode;
    FetchOptLevel foptlev;
    struct StatementBatch *batch = NULL;
    const Tile *tile;

    memset(&fetchOpts, 0, sizeof(fetchOpts));
    fetchOpts.memA = mulOpts->memA;
    fetchOpts.memB = mulOpts->memB;

    kgenAddStmt(ctx, "/* -- Tiles multiplier -- */\n");

    getVecLens(gset, &vlenA, &vlenB, NULL);

    /* check generator input values */
    ret = checkInput(gset, mulOpts);
    if (ret) {
        return ret;
    }

    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
        sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
                subdims[0].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, s);
    }

    core = checkReplaceCore(gset, core, tra, trb);
    if (((core == TILEMUL_MULADD || isComplexType(dtype)) &&
            !tra && trb)) {

        unsigned int n;
        const char *tname;

        n = commonTileSegmentLen(&gset->tileA, &gset->tileBX);
        getVectorTypeName(gset->tileA.dtype, n, &tname, NULL);

        sprintf(s,"%s sum;\n", tname);
        kgenAddStmt(ctx, s);
    }

    // FIXME: remove this kludge for backward compatibility
    if (fctx == NULL) {
        fctx = createFetchContext();
        if (fctx == NULL) {
            return -ENOMEM;
        }
        fetchOpts.mulOpts = mulOpts;
    }
    //////////////////////////////////////////////////////

    foptlev = getFetchOptLevels(fctx);

    if ((gset->flags & BGF_WHOLE_A) && internalFetchB &&
        (foptlev & FOPTLEV_MERGE_FETCHES)) {

        batch = createStmtBatch();
        if (batch == NULL) {
            ret = -ENOMEM;
            goto out;
        }
    }

    /*
     * First, disable sharing internal variables of the fetch code for
     * the first call so as the fetch generator could declares it for the
     * first matrix. And then re-enable it when invoking the fetch for
     * the other matrix if it has been actually enabled.
     */

    disableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);

    /*
     * fetch elements of the matrix B, by rows or by columns depending on
     * the transposing flag
     */
    if (internalFetchB) {
        tile = &gset->tileBX;
        fetchOpts.mrole = MATRIX_B;
        fetchOpts.linesNum = trb ? tile->nrCols : tile->nrRows;
        if (batch == NULL) {
            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
            if (!ret) {
                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
            }
        }
        else {
            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
        }
    }

    fetchOpts.mrole = MATRIX_A;

    if (foptlev & FOPTLEV_CAN_SHARE_TMP_AB) {
        enableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);
    }

    if (ret) {
        goto out;
    }

    if (gset->flags & BGF_WHOLE_A) {
        tile = &gset->tileA;
        iend = (tra) ? tile->nrCols : tile->nrRows;
        fetchOpts.linesNum = iend;
        if (batch == NULL) {
            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
        }
        else {
            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
            ret = flushStmtBatch(ctx, batch);
            if (!ret) {
                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
            }
        }

        if (!ret) {
            ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
        }
        if (ret) {
            goto out;

        }

        // main multiplying loop
        for (i = 0; i < iend; i++) {
            if (i) {
                kgenAddBlankLine(ctx);
            }
            genMulLineOnTile(ctx, gset, mulOpts, i, true);
        }
    }
    else {
        iend = (unsigned int)((tra) ? subdims[1].bwidth : subdims[1].y);
        fetchOpts.linesNum = 1;

        // main multiplying loop
        for (i = 0; i < iend; i++) {
            if (i) {
                kgenAddBlankLine(ctx);
                revalidateFetchContext(fctx, MATRIX_A);
            }
            // fetch elements of matrix A from single row
            fetchOpts.lineOffset = i;
            genFetchInputTile(ctx, fctx, gset, &fetchOpts);
            ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
            if (ret) {
                goto out;
            }
            genMulLineOnTile(ctx, gset, mulOpts, i, false);
        }
    }

    /*
     * increment K-related coordinates or pointers depending on addressing
     * mode
     */
    addrMode = getFetchAddrMode(fctx);
    if (addrMode & FETCH_ADDR_K_RELATIVE) {
        kgenAddBlankLine(ctx);
        genPointerUpdate(ctx, varNames->A, varNames->lda, bwidth,
                         subdims[0].y, vlenA, dtype, gset->flags,
                         !tra, localA);

        genPointerUpdate(ctx, varNames->B, varNames->ldb, bwidth,
                         subdims[0].x, vlenB, dtype, gset->flags,
                         trb, localB);
    }
    else {
        if (incK && (varNames->k != NULL) && !(localA && localB)) {
            sprintf(s, "\n%s += %lu;\n", varNames->k, bwidth);
            kgenAddStmt(ctx, s);
        }
    }

    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
        kgenEndBranch(ctx, NULL); // k1 loop
    }
    ret = kgenAddStmt(ctx, "/* ---------------------- */\n");
    ret = (ret) ? -EOVERFLOW : 0;

out:
    if (batch != NULL) {
        destroyStmtBatch(batch);
    }
    if (fctx != mulOpts->fctx) {
        destroyFetchContext(fctx);
    }

    return ret;
}
Example #7
0
static void
declareLocalVariables(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    Tile* parTile,
    TrsmExtraParams * extraParams)
{
    char tmp[1024];
    const SubproblemDim *dims = gset->subdims;
    const char* parTileTypeName = NULL;
    bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags,
                                   MATRIX_B);
    unsigned int locWidth;
    unsigned int tsize;
    unsigned int parTileSize;
    unsigned int l1Pans;
    unsigned int step;

    kgenAddStmt(ctx,
                 "const int lid = get_local_id(0);\n"
                 "const int gid = get_group_id(0);\n"
                 "GPtr uA, uB;\n"
                 "uint coordA, coordB;\n"
                 "uint m0 = 0, k0, m1;\n");

    if (isMatrixUpper(gset->kextra->flags)) {
        sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n",
                dims[0].y, dims[0].y);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Declare private blocks.
     * The region 'b' stores in different time tiles of both
     * the input matrices and the result
     */

    declareTileStorages(ctx, gset);

    *parTile = gset->tileBX;

    if (extraParams->ldsUse) {
        tsize = dtypeSize(gset->kextra->dtype);
        l1Pans = (unsigned int)(dims[0].x / dims[1].x);

        parTile->vecLen = (trb) ? (unsigned int)dims[1].x
                                : (unsigned int)dims[1].bwidth;
        parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize);
        parTile->trans = trb;

       /*
        * Allocate enough space in the local area to fit several tiles
        * at the stage1 (according to the unrolled factor) and one tile
        * at the stage2
        */

        locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor;
        if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
            locWidth = umax(locWidth, (unsigned int)dims[1].y);
        }
        if (trb) {
            parTile->nrRows = locWidth;
            parTile->nrCols = (unsigned int)dims[0].x;
            step = (unsigned int)dims[1].x / parTile->vecLen;
        }
        else {
            parTile->nrRows = (unsigned int)dims[0].x;
            parTile->nrCols = locWidth;
            step = (unsigned int)dims[1].x * locWidth / parTile->vecLen;
        }

        parTileSize = tileVectorsNum(parTile);

        getVectorTypeName(gset->kextra->dtype, parTile->vecLen,
                          &parTileTypeName, NULL);

        sprintf(tmp, "__local %s tmpB[%i];\n"
                     "LPtr lB;\n"
                     "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n",
                parTileTypeName, parTileSize, l1Pans, step);
        kgenAddStmt(ctx, tmp);

        if (useSkewedFetchB(gset)) {
            kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n",
                       l1Pans, gset->subdims[1].x);
        }
    }

    kgenAddBlankLine(ctx);
}
Example #8
0
static void
genTileInverting(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    const TileSet *tileSet)
{
    char tmp[1024];
    const CLBLASKernExtra *kextra = gset->kextra;
    KernelExtraFlags kflags = kextra->flags;
    DataType dtype = kextra->dtype;
    const SubproblemDim *dim = &gset->subdims[1];
    unsigned int accLen;
    unsigned int i, j, k;
    Tile srcTile;
    Tile dstTile;
    bool isU, isComplex;
    bool isInlined = gset->flags & BGF_EXPLICIT_INLINE;
    const char* typeNameA;
    const char* typeNameB;

    memcpy(&srcTile, &tileSet->bAsSqA, sizeof(srcTile));
    memcpy(&dstTile, &tileSet->squareA, sizeof(dstTile));

    getVectorTypeName(kextra->dtype, dstTile.vecLen, &typeNameA, NULL);
    getVectorTypeName(kextra->dtype, srcTile.vecLen, &typeNameB, NULL);
    isU = isMatrixUpper(kflags);
    isComplex = isComplexType(dtype);

    if (isComplex || dstTile.trans) {
        accLen = 1;
    }
    else {
        accLen = umin(srcTile.vecLen, dstTile.vecLen);
        accLen = umin(accLen, srcTile.nrCols);
    }

    if (!isInlined) {
        dstTile.baseName = "a";
        srcTile.baseName = "b";
        sprintf(tmp, "void\n"
                     "invertTile(%s *a, %s *b)\n",
                typeNameA, typeNameB);
        kgenDeclareFunction(ctx, tmp);
        kgenBeginFuncBody(ctx);
    }
    else {
        kgenAddStmt(ctx, "// Invert tile\n");
    }

    // made destination block unit
    genZeroTile(ctx, &dstTile);
    for (i = 0; i < dim->y; i++) {
        genSetUnitInTile(ctx, &dstTile, i, i);
    }
    kgenAddBlankLine(ctx);

    for (i = 0; i < dim->y; i++) {
        Kstring src, srcDiag, dst, dstLast;

        // current source diagonal element
        sprintfInvertedElement(&srcDiag, &srcTile, i, i, 1, isU);
        for (j = i; j < dim->y; j++) {
            // current source non diagonal element
            if (i) {
                sprintfInvertedElement(&src, &srcTile, j, i - 1, 1, isU);
            }

            for (k = 0; k < dim->y; k += accLen) {
                // current updated vectorized element
                sprintfInvertedElement(&dst, &dstTile, j, k, accLen, isU);

                // update
                if (i) {
                    // last updated vectorized element
                    sprintfInvertedElement(&dstLast, &dstTile, i - 1, k,
                                           accLen, isU);
                    if (isComplex) {
                        sprintf(tmp, "%s -= mul(%s, %s);\n",
                                dst.buf, dstLast.buf, src.buf);
                    }
                    else {
                        sprintf(tmp, "%s -= %s * %s;\n",
                                dst.buf, dstLast.buf, src.buf);
                    }
                    kgenAddStmt(ctx, tmp);
                }

                // divide on the diagonal element
                if (j == i) {
                    if (isComplex) {
                        sprintf(tmp, "%s = div(%s, %s);\n",
                                dst.buf, dst.buf, srcDiag.buf);
                    }
                    else {
                        sprintf(tmp, "%s /= %s;\n", dst.buf, srcDiag.buf);
                    }
                    kgenAddStmt(ctx, tmp);
                }
            }
        }
        if (i != dim->y - 1) {
            kgenAddBlankLine(ctx);
        }
    }

    if (!isInlined) {
        kgenEndFuncBody(ctx);
    }
    kgenAddBlankLine(ctx);

}
Example #9
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
                  ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
    char tmp[2048];
    const char *typeName;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset, tgset, lset, gset1;
    CLBLASKernExtra kextraTmp;
    TileMulOpts mulOpts, tmulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    size_t vecLen = kextra->vecLen;
    const char *outTypeName;
    bool b;
    TilePostFetchPrivate pfPriv;
    struct symvPrivate priv;
    size_t wgSize;
    bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
    bool tailK = (kflags & KEXTRA_TAILS_K) != 0;
    bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
    bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A);
    bool isComplex = isComplexType(dtype);
    Tile tileb;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    bool isHoriz = subdims[1].bwidth >= subdims[1].y;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth;
    unsigned int cLocal;
    unsigned int nPlans;

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    memset(&pfPriv, 0, sizeof(pfPriv));
    memset(&priv, 0, sizeof(priv));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);

    declareSymvKernel(ctx, dtype, pgran, kflags);

    ret = kgenBeginFuncBody(ctx);
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    kgenAddStmt(ctx, "#define M actualN\n");
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.flags |= BGF_WHOLE_A | BGF_UPTRS;

    gset.kextra = kextra;
    gset.pgran = pgran;

    initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET);
    gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows);

    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    cLocal = wgSize / bStep;
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n",
                 gid, cLocal, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint n = coordA;\n");
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddStmt(ctx, "actualN += startN;\n");

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "uint k = k0;\n");

    if (tailK) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, "if (n + Ktail < N)");
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        vnames->sizeM = "N";
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "N";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (rowMaj) {
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n");
    }
    sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)",
        subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    upper ^= rowMaj;
    tra ^= rowMaj;
    if (upper ^ rowMaj && tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    gset.tileA.trans ^= !upper;
    tgset = gset;
    tmulOpts = mulOpts;

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK)
    {
            kextraTmp = *kextra;
            gset1 = gset;

            kextraTmp.vecLen = 1;
            gset1.kextra = &kextraTmp;

            gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1;

            gset1.tileBX.nrRows = 1;
            gset1.tileA.nrCols = 1;
            kextraTmp.vecLenA = 1;
    }


    if (isHoriz)
    {
        lset = gset;
        lset.subdims[0].bwidth = lset.subdims[1].bwidth =
            lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y);
        lset.tileA.nrCols = lset.tileA.nrRows =
            lset.tileBX.nrRows = lset.subdims[1].y;

        kgenAddStmt(ctx, "// the diagonal\n");
        kgenBeginBranch(ctx, "if (k <= n)");
        kgenAddStmt(ctx, "uint k1 = k;\n");

        if (subdims[1].bwidth != subdims[1].y) {
            kgenAddStmt(ctx, "// the pred diagonal\n");
            sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */
        }

        initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth,
            lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET,
            lset.tileA.trans, lset.tileA.packed);
        declareOneTileStorage(ctx, &tileb);

        genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                mulOpts.flags, kflags);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.tilea = lset.tileA;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        lset.tileA = tileb;
        mulOpts.postFetch = genPostFetchMirror;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &lset, &mulOpts);
        if (ret != 0) {
            return ret;
        }

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans = lset.tileA.trans ^= true;
        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        if (subdims[1].bwidth != subdims[1].y) {
            size_t width = umax(subdims[1].bwidth, subdims[1].y);
            kgenAddStmt(ctx, "// the post diagonal\n");
            if (tailK) {
                kgenBeginBranch(ctx, "if(k < N)");
            }
            sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)",
                    lset.subdims[1].bwidth,
                    width, width, width,
                    lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */

            if (tailK) {
                kgenEndBranch(ctx, NULL);
                kgenBeginBranch(ctx, "else");
                /* Handle tail along vector X */

                kgenAddStmt(ctx, "N += Ntail;\n");

                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
                sprintf(tmp, "for (k += %lu; k < actualN; k++)",
                    lset.subdims[1].bwidth);
                kgenBeginBranch(ctx, tmp);

                gset1.tileA.trans = gset.tileA.trans;

                genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset1, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
                kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
                mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
                genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
#endif

                mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A;
                kgenEndBranch(ctx, NULL);
            }
        }

        sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);
    }
    else
    {

        kgenAddStmt(ctx, "// the diagonal\n");
        sprintf(tmp, "if (k <= (n  + (get_local_id(0)%%%lu)*%lu))",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

        kgenBeginBranch(ctx, NULL);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &gset;
        mulOpts.postFetch = genPostFetchVertDiag;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans ^= true;
        lset = gset;

        sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, NULL);

        priv.diag = false;
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL); /* if */
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n");
    }
    sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");

        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
        sprintf(tmp, "for (; k < N; k++)");
        kgenBeginBranch(ctx, tmp);

        gset1.tileA.trans = gset.tileA.trans;

        genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset1, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
        mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
#endif

        kgenEndBranch(ctx, NULL);

        kgenBeginBranch(ctx, "else");

        sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A;
        genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        pfPriv.gset = &tgset;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &tgset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        if (isHoriz) {
            sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth);
        }
        else {
            sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n",
                subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        }
        kgenAddStmt(ctx, tmp);

        kgenEndBranch(ctx, NULL); /* k loop */

        kgenBeginBranch(ctx, "if (k < N)");
        if (isHoriz) {
            kgenAddStmt(ctx, "k = n;\n");
        }
        else {
            sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
                subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
            kgenAddStmt(ctx, tmp);
        }

        genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        if (!isHoriz) {
            if (upper ^ rowMaj && tra) {
                tmulOpts.flags &= ~TILEMUL_TRA;
            }
            else {
                tmulOpts.flags |= TILEMUL_TRA;
            }
            kgenAddStmt(ctx, "Ktail = N - n;\n");
            priv.coord = true;
        }
        else {
            priv.coord = false;
        }
        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K;


        ret = tileMulGen(ctx, &lset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        kgenEndBranch(ctx, NULL);

        kgenEndBranch(ctx, NULL);
    }


    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    sprintf(tmp,"(%s - startN)", vnames->coordA);
    setResultPos(ctx, kflags, tmp);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
Example #10
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    //yes, KEXTRA_TAILS_K because it is set if N % bw != 0
    bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    char tmp[4096];
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    TilePostFetchPrivate pfPriv;
    unsigned int vecLen = kextra->vecLen;
    const char *outTypeName;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    const char *typeName;
    size_t wgSize;
    //unsigned int nStep = 32;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
    unsigned int cLocal;
    bool isComplex = isComplexType(dtype);
    unsigned int nPlans;

    typeName = dtypeBuiltinType(dtype);
    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // now, generate the kernel
    declareGemvKernel(ctx, dtype, pgran, kflags);
    ret = kgenBeginFuncBody(ctx);
    kgenAddStmt(ctx, "// M always denotes length of Y "
                     "and N denotes length of X in the kernel\n");
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);
    cLocal = wgSize/bStep;

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;

    gset.pgran = pgran;
    gset.kextra = kextra;
    gset.flags = BGF_UPTRS;

    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
                 gid, bStep, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < M && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    if (tailN) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "M";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    if (staggered) {
        vnames->k = "k1";
        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered);
        kgenAddStmt(ctx, tmp);
    }

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    vnames->k = "k";
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailN) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        kgenBeginBranch(ctx, "if (k < N)");

        mulOpts.flags |= TILEMUL_SKEW_B;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL;
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = subdims[1].bwidth;
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    setResultPos(ctx, kflags, vnames->coordA);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
Example #11
0
static int
genLoopsK(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    char *tmp)
{
    KernelExtraFlags kflags = gset->kextra->flags;
    const size_t y0 = gset->subdims[0].y;
    const size_t bwidth = gset->subdims[1].bwidth;
    int ret;
    bool isRel = false;
    const char *inTypeNameA, *inPtrNameA, *inTypeNameB, *inPtrNameB;

    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenA, &inTypeNameA, &inPtrNameA);
    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenB, &inTypeNameB, &inPtrNameB);

    sprintf(tmp, "uint k0;\n");
    kgenAddStmt(ctx, tmp);

    if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER |
                    KEXTRA_TAILS_K_LOWER))) {

        FetchAddrMode addrMode = FETCH_ADDR_A_RELATIVE | FETCH_ADDR_B_RELATIVE |
                                 FETCH_ADDR_K_RELATIVE;

        isRel = true;

        mulOpts->fctx = createFetchContext();
        if (mulOpts->fctx == NULL) {
            return -ENOMEM;
        }
        setFetchAddrMode(mulOpts->fctx, addrMode);

        gset->varNames.A = "pA";
        gset->varNames.B = "pB";
    }
    else {
        gset->flags |= BGF_UPTRS;
        kgenPrintf(ctx, "GPtr Ag, Bg;\n"
                        "\n"
                        "Ag.%s = A;\n"
                        "Bg.%s = B;\n\n",
                   inPtrNameA, inPtrNameB);
    }

    if (isMatrixUpper(kflags)) {
        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))
                   ) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            }
        }

        sprintf(tmp,
            "for (k0 = kBegin; "
                "(k0 <= (kBegin + %luu))&&(k0 < M); "
                "k0 += %lu)",
            y0,
            bwidth);
        kgenBeginBranch(ctx, tmp);

        kgenPrintf( ctx,
            "coord.z = k0;\n");

        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        //main triangle part
        sprintf(tmp,
            "for (; k0 <= max(0, (int)M - %lu); k0 += %lu)",
            y0,
            gset->subdims[1].bwidth);

        kgenBeginBranch(ctx, tmp);

        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // matrix side part
        // should be calculated by item0 of each subgroup
        sprintf(tmp, "for (; k0 < M; k0 += %lu)", bwidth);
        kgenBeginBranch(ctx, tmp);

        kgenPrintf( ctx,
            "coord.z = k0;\n");

        resetFetchNumA(mulOpts);
        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

    }
    else {
        // lower
        size_t diagBlocks; //Number of bw *y blocks that fit in y*y square

        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))
                   ) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB);
                break;
            }
        }

        diagBlocks = divRoundUp(y0, bwidth);
        sprintf(tmp, "uint iterK = min(currM + %luu, M);\n", y0);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "iterK = (iterK + %lu) / %lu;\n", bwidth - 1, bwidth);
        kgenAddStmt(ctx, tmp);

        // main triangle part
        sprintf(tmp, "for (k0 = 0; k0 < max(0, (int)iterK - %lu); k0++)",
                diagBlocks);
        kgenBeginBranch(ctx, tmp);
        mulOpts->postFetch = NULL;
        // part without diagonal elements post fetch zeroing
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // diagonal part
        sprintf(tmp, "for (; k0 < iterK; k0++)");
        kgenBeginBranch(ctx, tmp);

        kgenPrintf( ctx,
            "coord.z = k0 * %lu;\n",
            bwidth);

        // diagonal blocks part
        mulOpts->postFetch = genTrxmPostFetchZero;
        resetFetchNumA(mulOpts);
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (isRel) {
        destroyFetchContext(mulOpts->fctx);
        mulOpts->fctx = NULL;
    }

    return 0;
}