示例#1
0
int
genUpresFuncsWithFlags(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    UpdateResultFlags flags,
    char optFuncName[FUNC_NAME_MAXLEN],
    char genericFuncName[FUNC_NAME_MAXLEN])
{
    KernelExtraFlags kflags = gset->kextra->flags;
    UpdateResultOp op;
    int ret;

    op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;

    updateResultGenOld(ctx, gset, op, flags, NULL);
    ret = kgenAddBlankLine(ctx);
    if (ret) {
        return -EOVERFLOW;
    }

    kgenGetLastFuncName(optFuncName, FUNC_NAME_MAXLEN, ctx);

    if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) {
        flags |= UPRES_GENERIC;
        updateResultGenOld(ctx, gset, op, flags, NULL);
        kgenAddBlankLine(ctx);
        kgenGetLastFuncName(genericFuncName, FUNC_NAME_MAXLEN, ctx);
    }

    return (ret) ? -EOVERFLOW : 0;
}
示例#2
0
static void
genZeroTileTrash(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    MatrixRole mrole,
    Tile* tile)
{
    char tmp[1024];
    const SubproblemDim *dim = &gset->subdims[1];
    const CLBLASKernExtra *kextra = gset->kextra;
    unsigned int i, j;
    unsigned int step;
    Kstring elem;

    if (mrole == MATRIX_A) {
        kgenAddBlankLine(ctx);
    }
    else {
        kgenBeginBranch(ctx, NULL);
    }

    sprintf(tmp, "const int bound = (coordA + %lu > M) ? (M - coordA) : %lu;\n",
            dim->y, dim->y);
    kgenAddStmt(ctx, tmp);

    step = tileLineSegmentLen(tile);
    step = (tile->trans) ? 1 : step;

    for (j = 0; j < tile->nrRows; ++j) {
        for (i = 0; i < tile->nrCols; i+=step) {
            sprintfTileElement(&elem, tile, j, i, step);
            sprintf(tmp, "%s = (bound <= %u) ? 0 : %s;\n", elem.buf, j, elem.buf);
            kgenAddStmt(ctx, tmp);
        }
    }

    // Set units in the trash diagonal elements for a tile of A
    if (mrole == MATRIX_A) {
        for (i = 0; i < (unsigned int)dim->y; i++) {
            sprintfTileElement(&elem, tile, i, i, 1);
            sprintf(tmp, "%s = (bound <= %d) ? %s : %s;\n",
                    elem.buf, (int)i, strOne(kextra->dtype), elem.buf);
            kgenAddStmt(ctx, tmp);
        }
    }

    if (mrole == MATRIX_A) {
        kgenAddBlankLine(ctx);
    }
    else {
        kgenEndBranch(ctx, NULL);
    }
}
示例#3
0
void
genFillTileWithNAN(struct KgenContext *ctx, const Tile *tile)
{
    char tmp[1024];
    Kstring elem;
    unsigned int incRows, incCols;
    unsigned int i, j, v;

    if (!tile->trans) {
        incRows = 1;
        v = incCols = umin(tile->vecLen, tile->nrCols);
    }
    else {
        v = incRows = umin(tile->vecLen, tile->nrRows);
        incCols = 1;
    }

    for (i = 0; i < tile->nrRows; i += incRows) {
        for (j = 0; j < tile->nrCols; j += incCols) {
            sprintfTileElement(&elem, tile, i, j, v);
            sprintf(tmp, "%s = NAN;\n", elem.buf);
            kgenAddStmt(ctx, tmp);
        }
    }

    kgenAddBlankLine(ctx);
}
示例#4
0
void
genZeroTile(struct KgenContext *ctx, const Tile *tile)
{
    char tmp[1024];
    Kstring elem;
    unsigned int incRows, incCols;
    unsigned int i, j, v;

    v = tileLineSegmentLen(tile);
    if (!tile->trans) {
        incRows = 1;
        incCols = v;
    }
    else {
        incRows = v;
        incCols = 1;
    }

    for (i = 0; i < tile->nrRows; i += incRows) {
        for (j = 0; j < tile->nrCols; j += incCols) {
            sprintfTileElement(&elem, tile, i, j, v);
            sprintf(tmp, "%s = 0;\n", elem.buf);
            kgenAddStmt(ctx, tmp);
        }
    }

    kgenAddBlankLine(ctx);
}
示例#5
0
static int
copyImgPostUnroll(struct KgenContext *ctx, void *priv)
{
    char tmp[1024];
    GenPriv *gpriv = (GenPriv*)priv;
    const char *vfield = dtypeUPtrField(gpriv->dtype);

    if (gpriv->work && gpriv->work->tail) {
        addCopyTailCode(ctx, gpriv);
    }

    kgenAddBlankLine(ctx);

    if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
        sprintf(tmp, "src.%s += %s;\n", vfield, gpriv->globLDName);
    }
    else if (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
        sprintf(tmp, "src.%s += %lu;\n", vfield, gpriv->lmemLD);
    }
    kgenAddStmt(ctx, tmp);
    if(gpriv->packed) {
        sprintf(tmp, "index++;\n");
    } else {
        sprintf(tmp, "y++;\n");
    }
    return kgenAddStmt(ctx, tmp);
}
示例#6
0
/*
 * Add statement setting initial coordinates pointer for image
 *
 */
static void
addSettingImageXYCode(
    struct KgenContext *ctx,
    const char *xName,
    const char *yName,
    const PGranularity *pgran,
    GenPriv *gpriv)
{
    char tmp[4096];
    const ItemWork *work = gpriv->work;
    size_t gsize = pgran->wgSize[0] * pgran->wgSize[1];

    if (gpriv->packed) {
        sprintf(tmp, "pLine = ((get_image_width(dst) - startX) * %d / %lu) * %lu;\n",
                FLOAT4_VECLEN / gpriv->nfloats, gpriv->dim->x, gpriv->lmemLD);
        kgenAddStmt(ctx, tmp);
        if (gpriv->dim->y < gsize) {
            sprintf(tmp, "index = %s / %u;\n", lidVarName,
                    work->itemsPerRow);
        }
        else {
            sprintf(tmp, "index = %s * %lu;\n", lidVarName,
                    work->nrRows);
        }
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "x = startX + (index * %lu) %% pLine / %u;\n", gpriv->dim->x,
                FLOAT4_VECLEN / gpriv->nfloats);
        kgenAddStmt(ctx, tmp);
        if (gpriv->dim->y < gsize) {
            sprintf(tmp, "x += (%s %% %u) * (%lu / %u / %u);\n", lidVarName,
                    work->itemsPerRow, gpriv->dim->x,
                    (FLOAT4_VECLEN / gpriv->nfloats), work->itemsPerRow);
            kgenAddStmt(ctx, tmp);
        }
        sprintf(tmp, "y = startY + (index * %lu) / pLine;\n", gpriv->dim->x);
        kgenAddStmt(ctx, tmp);
    }
    else {
        if (gpriv->dim->y < gsize) {
            sprintf(tmp, "%s = startX + %s %% %u * %lu / %d;\n",
                    xName, lidVarName, work->itemsPerRow, work->nrCols,
                    FLOAT4_VECLEN/gpriv->nfloats);
            kgenAddStmt(ctx, tmp);
            sprintf(tmp, "%s = startY + %s / %u;\n", yName, lidVarName,
                    work->itemsPerRow);
            kgenAddStmt(ctx, tmp);
        }
        else {
            sprintf(tmp, "%s = startX;\n", xName);
            kgenAddStmt(ctx, tmp);
            sprintf(tmp, "%s = startY + %s * %lu;\n", yName, lidVarName,
                    gpriv->work->nrRows);
            kgenAddStmt(ctx, tmp);
        }
    }

    kgenAddBlankLine(ctx);
}
示例#7
0
static int
copyMemSingleTransp(struct KgenContext *ctx, void *priv)
{
    char tmp[1024];
    GenPriv *gpriv = (GenPriv*)priv;
    const char *vfield;

    vfield = dtypeUPtrField(gpriv->dtype);
    kgenAddBlankLine(ctx);

    if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) {
        if (gpriv->locLDName) {
            sprintf(tmp, "*%s.%s = *%s.%s++;\n",
                    gpriv->dstName, vfield,
                    gpriv->srcName, vfield);
            kgenAddStmt(ctx, tmp);

            if (gpriv->conjugate) {
                sprintf(tmp, "(*%s.%s).y = -(*%s.%s).y;\n",
                        gpriv->dstName, vfield, gpriv->dstName,
                        vfield);
                kgenAddStmt(ctx, tmp);
            }
            sprintf(tmp, "%s.%s += %s;\n",
                    gpriv->dstName, vfield, gpriv->locLDName);
        }
        else {
            sprintf(tmp, "%s.%s[%lu] = *%s.%s++;\n",
                    gpriv->dstName, vfield,
                    gpriv->lmemLD * gpriv->cnt, gpriv->srcName,
                    vfield);
            if (gpriv->conjugate) {
                kgenAddStmt(ctx, tmp);
                sprintf(tmp, "%s.%s[%lu].y = -%s.%s[%lu].y;\n",
                        gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt,
                        gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt);
            }
        }
    }
    else {
        if (gpriv->locLDName) {
            sprintf(tmp, "*%s.%s++ = *%s.%s;\n"
                         "%s.%s += %s;\n",
                    gpriv->dstName, vfield,
                    gpriv->srcName, vfield,
                    gpriv->srcName, vfield, gpriv->locLDName);
        }
        else {
            sprintf(tmp, "*%s.%s++ = %s.%s[%lu];\n",
                    gpriv->dstName, vfield, gpriv->srcName, vfield,
                    gpriv->lmemLD * gpriv->cnt);
        }
    }
    gpriv->cnt++;

    return kgenAddStmt(ctx, tmp);
}
示例#8
0
/*
 * Add statement setting initial local pointer for the work item
 *
 * @ld: lead dimension for the local block in float words;
 *       if it's zero, the "ld" argument of a generated function is
 *       used instead
 */
static void
addSettingPtrCode(
    struct KgenContext *ctx,
    const char *ptrName,
    size_t ld,
    bool transpose,
    const PGranularity *pgran,
    GenPriv *gpriv)
{
    char tmp[4096];
    const char *vfield;
    const SubproblemDim *dim = gpriv->dim;
    const ItemWork *work = gpriv->work;
    size_t gsize;

    vfield = dtypeUPtrField(gpriv->dtype);
    gsize = pgran->wgSize[0] * pgran->wgSize[1];

    if (ld) {
        // offset between two rows and two elements in each row
        size_t roff, eoff;

        if (transpose) {
            roff = 1;
            eoff = ld;
        }
        else {
            roff = ld;
            eoff = 1;
        }

        if (dim->y < gsize) {
            sprintf(tmp, "%s.%s += (%s / %u) * %lu + (%s %% %u * %lu) * %lu;\n",
                    ptrName, vfield, lidVarName, work->itemsPerRow,
                    roff, lidVarName, work->itemsPerRow, work->nrCols, eoff);
        }
        else {
            sprintf(tmp, "%s.%s += %s * %lu * %lu;\n",
                    ptrName, vfield, lidVarName, work->nrRows, roff);
        }
    }
    else {
        if (dim->y < gsize) {
            sprintf(tmp, "%s.%s += (startRow + %s / %u) * %s + "
                                   "startCol + %s %% %u * %lu;\n",
                    ptrName, vfield, lidVarName, work->itemsPerRow,
                    gpriv->globLDName, lidVarName, work->itemsPerRow, work->nrCols);
        }
        else {
            sprintf(tmp, "%s.%s += (startRow + %s * %lu) * %s + startCol;\n",
                    ptrName, vfield, lidVarName, work->nrRows, gpriv->globLDName);
        }
    }

    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);
}
示例#9
0
static int
checkTriggerPostFetch(
    struct KgenContext *ctx,
    const TileMulOpts *mulOpts,
    MatrixRole mrole)
{
    int ret = 0;

    if (mulOpts->postFetch) {
        ret = mulOpts->postFetch(ctx, mrole, mulOpts->postFetchPriv);
        kgenAddBlankLine(ctx);
    }

    return ret;
}
示例#10
0
static void
genPreloadedTileMul(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    const Tile *parTile,
    const char* copy2LDSFuncName)
{
    char tmp[1024];
    KernelExtraFlags kflags = gset->kextra->flags;
    unsigned int bwidthOld;
    const char *oldNameB;
    const char *ptrName;

    getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName);
    kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);

    if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
        sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n",
            copy2LDSFuncName, gset->subdims[0].x);
    }
    else {
        sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n",
            copy2LDSFuncName, gset->subdims[0].x);
    }
    kgenAddStmt(ctx, tmp);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "lB = lBMain;\n\n");

    mulOpts->memB = CLMEM_LOCAL_MEMORY;
    oldNameB = gset->varNames.B;
    bwidthOld = (unsigned int)gset->subdims[0].bwidth;
    gset->varNames.B = "lB";
    gset->subdims[0].bwidth = (parTile->trans) ? parTile->nrRows :
                                                 parTile->nrCols;

    tileMulGen(ctx, gset, mulOpts);

    gset->varNames.B = oldNameB;
    gset->subdims[0].bwidth = bwidthOld;
    mulOpts->memB = CLMEM_GLOBAL_MEMORY;
}
示例#11
0
/*
 * Setup coordinates before beginning a trsm stage
 * A caller must ensure the strict stage sequence:
 * BLOCK_UPDATE -> TILE_UPDATE
 */
static void
genSetupCoords(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    enum TrsmStage stage)
{
    char tmp[1024];
    KernelExtraFlags kflags = gset->kextra->flags;
    const SubproblemDim *dims = gset->subdims;
    unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x);
    const char *s;

    s = isMatrixUpper(kflags) ? "currM" : "m0";
    sprintf(tmp, "coordA = %s + (lid / %u * %lu);\n",
            s, l1Pans, dims[1].y);
    kgenAddStmt(ctx, tmp);

    switch (stage) {
    case BLOCK_UPDATE:
        if (isMatrixUpper(kflags)) {
            sprintf(tmp, "k0 = currM + %lu;\n", dims[0].y);
        }
        else {
            sprintf(tmp, "k0 = 0;\n");
        }
        break;
    case TILE_UPDATE:
        if (isMatrixUpper(kflags)) {
            sprintf(tmp, "k0 = currM + %lu - m1 * %lu;\n",
                    dims[0].y - dims[1].y, dims[1].y);
        }
        else {
            sprintf(tmp, "k0 = m0 + m1 * %lu;\n", dims[1].y);
        }
        break;
    }

    kgenAddStmt(ctx, tmp);

    sprintf(tmp, "coordB = gid * %lu + (lid %% %u * %lu);\n",
            dims[0].x, l1Pans, dims[1].x);

    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);
}
示例#12
0
static void
genInitCurrM(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    KernelExtraFlags kflags)
{
    char tmp[1024];

    if (isMatrixUpper(kflags)) {
        strcpy(tmp, "currM = 0;\n");
    }
    else {
        sprintf(tmp, "currM = (M - 1) / %lu * %lu;\n", dim->y, dim->y);
    }

    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);
}
示例#13
0
int
generateZeroingFuncs(
    ZeroFuncs *funcNames,
    struct KgenContext *ctx,
    const SubproblemDim *blasDim,
    const PGranularity *pgran,
    DataType dtype,
    ZeroGenHelperFlags flags)
{
    int ret = 0;
    SubproblemDim dim[MATRIX_ROLES_NUMBER];
    size_t tsize, nvecs;
    unsigned int i, j;

    tsize = dtypeSize(dtype);
    nvecs = fl4RowWidth(blasDim->bwidth, tsize);

    checkInitSubdim(&dim[MATRIX_A], flags, ZF_MATRIX_A, nvecs * blasDim->y, 1);
    checkInitSubdim(&dim[MATRIX_B], flags, ZF_MATRIX_B, nvecs * blasDim->x, 1);
    nvecs = fl4RowWidth(blasDim->x, tsize);
    checkInitSubdim(&dim[MATRIX_C], flags, ZF_MATRIX_C, nvecs * blasDim->y, 1);

    for (i = 0; (i < MATRIX_ROLES_NUMBER) && !ret; i++) {
        if (dim[i].x == SUBDIM_UNUSED) {
            continue;
        }

        // check whether the function is already generated
        j = lookupDim(dim, i);
        if (j != IDX_INVAL) {
            strcpy(funcNames->names[i], funcNames->names[j]);
        }
        else {
            ret = f4zeroBlockGen(ctx, &dim[i], pgran, "__local");
            if (!ret) {
                kgenGetLastFuncName(funcNames->names[i], FUNC_NAME_MAXLEN,
                                    ctx);
            }
            kgenAddBlankLine(ctx);
        }
    }

    return ret;
}
示例#14
0
static int
copyMemPostUnroll(struct KgenContext *ctx, void *priv)
{
    char tmp[1024];
    const char *s[2] = {"src", "dst"};
    GenPriv *gpriv = (GenPriv*)priv;
    int gdir;
    const char *vfield;

    gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;

    if (gpriv->work && gpriv->work->tail) {
        addCopyTailCode(ctx, gpriv);
    }

    if (!gpriv->transp) {
        kgenAddBlankLine(ctx);
    }

    // modify pointers
    vfield = dtypeUPtrField(gpriv->dtype);
    sprintf(tmp, "%s.%s += %s;\n", s[gdir], vfield, gpriv->globLDName);
    kgenAddStmt(ctx, tmp);

    if (gpriv->transp) {
        sprintf(tmp, "%s.%s++;\n", s[1 - gdir], vfield);
    }
    else {
        if (gpriv->locLDName) {
            sprintf(tmp, "%s.%s += %s;\n", s[1 - gdir],
                    vfield, gpriv->locLDName);
        }
        else {
            sprintf(tmp, "%s.%s += %lu;\n", s[1 - gdir],
                    vfield, gpriv->lmemLD);
        }
    }

    return kgenAddStmt(ctx, tmp);
}
示例#15
0
/*
 * Generate cyclical tile shifting so as to convert the skewed
 * storing to "one-to-one", i. e. the first element in the tile
 * matches to the first element of the respective tile in the
 * output matrix.
 */
static void
genTileCyclicalShift(struct KgenContext *ctx, BlasGenSettings *gset)
{
    const char *tname;
    Kstring k1, k2, *src, *dst, *ktmp;
    unsigned int row, col;
    unsigned int seglen;
    Tile *tileC = &gset->tileCY;

    seglen = tileLineSegmentLen(tileC);
    getVectorTypeName(gset->kextra->dtype, seglen, &tname, NULL);

    kgenAddStmt(ctx, "\n// deliver from skewing in the result\n");
    kgenBeginBranch(ctx, "for (uint i = 0; i < skewX; i++)");
    kgenPrintf(ctx, "%s tmp;\n\n", tname);

    src = &k1;
    dst = &k2;

    // Skewing may be used only in case of transposed C
    for (row = 0; row < tileC->nrRows; row += seglen) {
        sprintfTileElement(dst, tileC, row, tileC->nrCols - 1, seglen);
        kgenPrintf(ctx, "tmp = %s;\n", dst->buf);
        for (col = tileC->nrCols - 1; col > 0; col--) {
            sprintfTileElement(src, tileC, row, col - 1, seglen);
            kgenPrintf(ctx, "%s = %s;\n", dst->buf, src->buf);
            // swap pointer
            ktmp = src;
            src = dst;
            dst = ktmp;
        }
        kgenPrintf(ctx, "%s = tmp;\n", dst->buf);
    }

    kgenEndBranch(ctx, NULL);
    kgenAddBlankLine(ctx);
}
示例#16
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    //yes, KEXTRA_TAILS_K because it is set if N % bw != 0
    bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    char tmp[4096];
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    TilePostFetchPrivate pfPriv;
    unsigned int vecLen = kextra->vecLen;
    const char *outTypeName;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    const char *typeName;
    size_t wgSize;
    //unsigned int nStep = 32;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
    unsigned int cLocal;
    bool isComplex = isComplexType(dtype);
    unsigned int nPlans;

    typeName = dtypeBuiltinType(dtype);
    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // now, generate the kernel
    declareGemvKernel(ctx, dtype, pgran, kflags);
    ret = kgenBeginFuncBody(ctx);
    kgenAddStmt(ctx, "// M always denotes length of Y "
                     "and N denotes length of X in the kernel\n");
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);
    cLocal = wgSize/bStep;

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;

    gset.pgran = pgran;
    gset.kextra = kextra;
    gset.flags = BGF_UPTRS;

    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
                 gid, bStep, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < M && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    if (tailN) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "M";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    if (staggered) {
        vnames->k = "k1";
        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered);
        kgenAddStmt(ctx, tmp);
    }

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    vnames->k = "k";
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailN) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        kgenBeginBranch(ctx, "if (k < N)");

        mulOpts.flags |= TILEMUL_SKEW_B;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL;
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = subdims[1].bwidth;
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    setResultPos(ctx, kflags, vnames->coordA);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#17
0
int
generateImageCopyFuncs(
    CopyImgFuncs *copyFuncs,
    struct KgenContext *ctx,
    BlasFunctionID funcID,
    const BlasGenSettings *gset)
{
    const SubproblemDim *dims = gset->subdims;
    KernelExtraFlags kflags = gset->kextra->flags;
    DataType dtype = gset->kextra->dtype;
    const PGranularity *pgran = gset->pgran;
    CopyPattern pattern;
    // mandatory flags for global to local copying
    DBlockCopyFlags glcpFlags[2] = {0, 0};
    struct KgenGuard *guard;
    unsigned int tsize;
    int ret = 0;
    bool isTra, areTails, isConjA;
    bool customize;

    if (kflags & KEXTRA_NO_COPY_VEC_A) {
        glcpFlags[0] = DBLOCK_COPY_NOT_VECTORIZE;
    }
    if (kflags & KEXTRA_NO_COPY_VEC_B) {
        glcpFlags[1] = DBLOCK_COPY_NOT_VECTORIZE;
    }

    tsize = dtypeSize(dtype);
    isTra = isMatrixAccessColMaj(funcID, kflags, MATRIX_A);
    isConjA = isMatrixConj(kflags, MATRIX_A);
    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
    customize = (funcID == CLBLAS_TRMM);

    guard = createKgenGuard(ctx, cpyImgGenCallback, sizeof(CopyPattern));
    if (guard == NULL) {
        return -ENOMEM;
    }

    memset(&pattern, 0, sizeof(pattern));

    pattern.zeroing = false;
    pattern.dim = dims[0];
    pattern.dir = DBLOCK_GLOBAL_TO_IMAGE;
    pattern.dtype = dtype;
    pattern.flags = 0;
    pattern.generic = false;
    pattern.pgran = pgran;

    if (!(customize && (isTra || isConjA))) {
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[0].y;
        findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[0].x;
    findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[1].y;
    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
    findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_A],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[1].x;
    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
    findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // Global to local optimized
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    if (customize || isComplexType(dtype)) {
        pattern.flags = (!customize || isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
        pattern.flags |= glcpFlags[0];
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[1].y;
        findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    if ((funcID == CLBLAS_GEMM) && isComplexType(dtype)) {
        pattern.flags = DBLOCK_COPY_CONJUGATE | glcpFlags[1];
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[1].x;
        findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_B],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // Global to local generic
    pattern.dim = dims[0];
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    pattern.generic = true;
    if (!customize || areTails) {
        pattern.flags = (isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
        pattern.flags |= glcpFlags[0];
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocalGeneric[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    pattern.flags = (kflags & KEXTRA_CONJUGATE_B) ? DBLOCK_COPY_CONJUGATE : 0;
    pattern.flags |= glcpFlags[1];
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalGeneric[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // Global to local transposed functions
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    pattern.flags = (kflags & KEXTRA_NO_COPY_VEC_A) ?
                    DBLOCK_COPY_NOT_VECTORIZE : 0;
    pattern.flags |= glcpFlags[0];
    if (!customize || isTra) {
        pattern.generic = false;
        if (isConjA) {
            pattern.flags |= DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
        }
        else {
            pattern.flags |= DBLOCK_COPY_TRANSPOSE;
        }
        pattern.dim.x = dims[1].y;
        pattern.dim.y = dims[0].bwidth;

        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocalTransposed[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    if (!customize || (isTra && areTails)) {
        pattern.generic = true;
        pattern.dim.x = 0;
        pattern.dim.y = 0;
        findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposedGeneric[MATRIX_A],
                         FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    pattern.generic = false;
    pattern.dim.x = dims[1].x;
    pattern.dim.y = dims[0].bwidth;
    if (kflags & KEXTRA_CONJUGATE_B) {
        pattern.flags = DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
    }
    else {
        pattern.flags = DBLOCK_COPY_TRANSPOSE;
    }
    pattern.flags |= glcpFlags[1];
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposed[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.generic = true;
    pattern.dim.x = 0;
    pattern.dim.y = 0;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposedGeneric[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // generate two local zeroing functions for matrix A and matrix B blocks
    pattern.zeroing = true;
    pattern.dim = dims[0];
    pattern.generic = false;
    pattern.flags = 0;
    pattern.dim.y = 1;
    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].y;

    findGenerateFunction(guard, &pattern,
                         copyFuncs->zeroBlock[MATRIX_A],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].x;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->zeroBlock[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    ret = kgenAddBlankLine(ctx);

    destroyKgenGuard(guard);
    return ret;
}
示例#18
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
                  ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
    char tmp[2048];
    const char *typeName;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset, tgset, lset, gset1;
    CLBLASKernExtra kextraTmp;
    TileMulOpts mulOpts, tmulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    size_t vecLen = kextra->vecLen;
    const char *outTypeName;
    bool b;
    TilePostFetchPrivate pfPriv;
    struct symvPrivate priv;
    size_t wgSize;
    bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
    bool tailK = (kflags & KEXTRA_TAILS_K) != 0;
    bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
    bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A);
    bool isComplex = isComplexType(dtype);
    Tile tileb;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    bool isHoriz = subdims[1].bwidth >= subdims[1].y;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth;
    unsigned int cLocal;
    unsigned int nPlans;

    wgSize = (subdims[0].y / subdims[1].y) *
            (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    memset(&pfPriv, 0, sizeof(pfPriv));
    memset(&priv, 0, sizeof(priv));
    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);

    declareSymvKernel(ctx, dtype, pgran, kflags);

    ret = kgenBeginFuncBody(ctx);
    /* 1D work space. Matrix is divided among wi, each calculates it's own
     * part of vector y */

    kgenAddStmt(ctx, "#define M actualN\n");
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.flags |= BGF_WHOLE_A | BGF_UPTRS;

    gset.kextra = kextra;
    gset.pgran = pgran;

    initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET);
    gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows);

    if (isComplex) {
         gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    cLocal = wgSize / bStep;
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
                outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n",
                 gid, cLocal, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint n = coordA;\n");
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n",
                 lid,  bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddStmt(ctx, "actualN += startN;\n");

    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);

    kgenAddBlankLine(ctx);

    kgenAddStmt(ctx, "uint k = k0;\n");

    if (tailK) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, "if (n + Ktail < N)");
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        vnames->sizeM = "N";
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = ""; //should not be used for vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "N";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (rowMaj) {
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n");
    }
    sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)",
        subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    upper ^= rowMaj;
    tra ^= rowMaj;
    if (upper ^ rowMaj && tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    gset.tileA.trans ^= !upper;
    tgset = gset;
    tmulOpts = mulOpts;

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK)
    {
            kextraTmp = *kextra;
            gset1 = gset;

            kextraTmp.vecLen = 1;
            gset1.kextra = &kextraTmp;

            gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1;

            gset1.tileBX.nrRows = 1;
            gset1.tileA.nrCols = 1;
            kextraTmp.vecLenA = 1;
    }


    if (isHoriz)
    {
        lset = gset;
        lset.subdims[0].bwidth = lset.subdims[1].bwidth =
            lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y);
        lset.tileA.nrCols = lset.tileA.nrRows =
            lset.tileBX.nrRows = lset.subdims[1].y;

        kgenAddStmt(ctx, "// the diagonal\n");
        kgenBeginBranch(ctx, "if (k <= n)");
        kgenAddStmt(ctx, "uint k1 = k;\n");

        if (subdims[1].bwidth != subdims[1].y) {
            kgenAddStmt(ctx, "// the pred diagonal\n");
            sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */
        }

        initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth,
            lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET,
            lset.tileA.trans, lset.tileA.packed);
        declareOneTileStorage(ctx, &tileb);

        genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                mulOpts.flags, kflags);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.tilea = lset.tileA;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        lset.tileA = tileb;
        mulOpts.postFetch = genPostFetchMirror;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &lset, &mulOpts);
        if (ret != 0) {
            return ret;
        }

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans = lset.tileA.trans ^= true;
        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        if (subdims[1].bwidth != subdims[1].y) {
            size_t width = umax(subdims[1].bwidth, subdims[1].y);
            kgenAddStmt(ctx, "// the post diagonal\n");
            if (tailK) {
                kgenBeginBranch(ctx, "if(k < N)");
            }
            sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)",
                    lset.subdims[1].bwidth,
                    width, width, width,
                    lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);

            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL); /* k loop */

            if (tailK) {
                kgenEndBranch(ctx, NULL);
                kgenBeginBranch(ctx, "else");
                /* Handle tail along vector X */

                kgenAddStmt(ctx, "N += Ntail;\n");

                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
                sprintf(tmp, "for (k += %lu; k < actualN; k++)",
                    lset.subdims[1].bwidth);
                kgenBeginBranch(ctx, tmp);

                gset1.tileA.trans = gset.tileA.trans;

                genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset1, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
                kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
                mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
                genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                          mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
#endif

                mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A;
                kgenEndBranch(ctx, NULL);
            }
        }

        sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);
    }
    else
    {

        kgenAddStmt(ctx, "// the diagonal\n");
        sprintf(tmp, "if (k <= (n  + (get_local_id(0)%%%lu)*%lu))",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames,
                    mulOpts.flags, kflags);

        kgenBeginBranch(ctx, NULL);

        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &gset;
        mulOpts.postFetch = genPostFetchVertDiag;
        mulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans ^= true;
        lset = gset;

        sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
            subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, NULL);

        priv.diag = false;
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL); /* if */
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n");
    }
    sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
            mulOpts.flags, kflags);

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL); /* k loop */

    if (tailK) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");

        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
        sprintf(tmp, "for (; k < N; k++)");
        kgenBeginBranch(ctx, tmp);

        gset1.tileA.trans = gset.tileA.trans;

        genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset1, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
        mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
#endif

        kgenEndBranch(ctx, NULL);

        kgenBeginBranch(ctx, "else");

        sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);

        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A;
        genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        pfPriv.gset = &tgset;
        priv.diag = false;

        pfPriv.funcID = CLBLAS_SYMV;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        ret = tileMulGen(ctx, &tgset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        if (isHoriz) {
            sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth);
        }
        else {
            sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n",
                subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
        }
        kgenAddStmt(ctx, tmp);

        kgenEndBranch(ctx, NULL); /* k loop */

        kgenBeginBranch(ctx, "if (k < N)");
        if (isHoriz) {
            kgenAddStmt(ctx, "k = n;\n");
        }
        else {
            sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
                subdims[1].y/subdims[1].bwidth, subdims[1].bwidth);
            kgenAddStmt(ctx, tmp);
        }

        genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames,
                tmulOpts.flags, kflags);

        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;

        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;

        if (!isHoriz) {
            if (upper ^ rowMaj && tra) {
                tmulOpts.flags &= ~TILEMUL_TRA;
            }
            else {
                tmulOpts.flags |= TILEMUL_TRA;
            }
            kgenAddStmt(ctx, "Ktail = N - n;\n");
            priv.coord = true;
        }
        else {
            priv.coord = false;
        }
        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K;


        ret = tileMulGen(ctx, &lset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }

        kgenEndBranch(ctx, NULL);

        kgenEndBranch(ctx, NULL);
    }


    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx,NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);

    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    sprintf(tmp,"(%s - startN)", vnames->coordA);
    setResultPos(ctx, kflags, tmp);

    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);
    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#19
0
// Preparation function for images based kernel generator
static ssize_t
preparator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    char tmp[4096], conjStr[1024];
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CopyImgFuncs copyImgFuncs;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset;
    unsigned int vecLen;
    unsigned int tsize;
    const char *typeName;
    char fpref;
    bool b;
    size_t localBufSize;
    ssize_t ret;
    const char *conjCond;

    const char *functionHeadA =
        "int tra, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "uint m = (gid / bpr) * %lu;\n"
        "uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "A += offsetA;\n"
        "tra = (!transA && order == clblasColumnMajor) ||\n"
        "      (transA && order == clblasRowMajor);\n"
        "if (m >= M) {\n"
        "     return;\n"
        "}\n";

    const char *functionHeadB =
        "int trb, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "const uint n = (gid / bpr) * %lu;\n"
        "const uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "B += offsetB;\n"
        "trb = (!transB && order == clblasRowMajor) ||\n"
        "      (transB && order == clblasColumnMajor);\n"
        "if (n >= N) {\n"
        "    return;\n"
        "}\n";

    // Distribute blocks across compute units and copy matrix A to image.
    // Transposition and filling with zeros in unaligned cases is made using
    // buffer in local memory.
    const char *copyToImageA =
        "//copy matrix A block\n"
        "y = m + %u <= M ? %u : M - m;\n"
        "x = k + %u <= K ? %u : K - k;\n"
        "aligned = (x == %u) && (y == %u) && %d;\n"
        "int atcase = aligned * 10 + tra;\n"
        "%s" // conjugated check
        "if (atcase != 10) {\n"
        "    %s((__local float4*)temp);\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "}\n"
        "switch(atcase) {\n"
        "case 10: //aligned, not transposed\n"
        "    %s(imgA, k / %u, m, (GPtr)A, m, k, lda);\n"
        "    break;\n"
        "%s" // conjugated case
        "case 1: //not aligned, transposed\n"
        "    // generic transposed global to local\n"
        "    %s((LPtr)temp, (GPtr)A, k, m, x, y, %u, lda);\n"
        "    break;\n"
        "case 0: //not aligned, not transposed\n"
        "    // generic global to local\n"
        "    %s((LPtr) temp, (GPtr)A, m, k, y, x, %u, lda);\n"
        "    break;\n"
        "case 11: //aligned, transposed\n"
        "    // optimized transposed global to local\n"
        "    %s((LPtr) temp, (GPtr)A, k, m, lda);\n"
        "    break;\n"
        "}\n"
        "if (atcase != 10) {\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "    %s(imgA, k / %u, m, (LPtr) temp);\n"
        "}\n"
        "\n";

    const char *copyToImageB =
            "//copy matrix B block\n"
            "y = n + %u <= N ? %u : N - n;\n"
            "x = k + %u <= K ? %u : K - k;\n"
            "aligned = (x == %u) && (y == %u) && %d;\n"
            "int atcase = aligned * 10 + trb;\n"
            "%s" // conjugated check
            "if (atcase != 10) {\n"
            "    %s((__local float4*)temp);\n"
            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
            "}\n"
            "switch (atcase) {\n"
            "case 10: //aligned, not transposed\n"
            "    %s(imgB, k / %u, n, (GPtr)B, n, k, ldb);\n"
            "    break;\n"
            "%s" // conjugated case
            "case 1: //not aligned, transposed\n"
            "    // generic transposed global to local\n"
            "    %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
            "    break;\n"
            "case 0: //not aligned, not transposed\n"
            "    // generic global to local\n"
            "    %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
            "    break;\n"
            "case 11: //transposed, aligned\n"
            "    // optimized transposed global to local\n"
            "    %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
            "    break;\n"
            "}\n"
            "if (atcase != 10) {\n"
            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
            "    %s(imgB, k / %u, n, (LPtr)temp);\n"
            "}\n"
            "\n";

    memset(&copyImgFuncs, 0, sizeof(copyImgFuncs));
    memset(&gset, 0, sizeof(gset));

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    tsize = dtypeSize(dtype);

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    declareBlasEnums(ctx);

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;

    // generate necessary memory to image copying functions
    generateImageCopyFuncs(&copyImgFuncs, ctx, CLBLAS_GEMM, &gset);

    kgenAddBlankLine(ctx);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    if (kextra->kernType == CLBLAS_PREP_A_KERNEL) {
        sprintf(tmp, prepareImagesGemmDeclA, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        ret = kgenBeginFuncBody(ctx);

        // same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadA,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].y, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                    "(transA == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
                             "    %s((LPtr)temp, (GPtr)A, m, k, lda);\n"
                             "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_A]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageA,
                subdims[1].y, subdims[1].y, // y = m + dy <= M ?...
                subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].y, // aligned = (x==bw1)&&(y==dy1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_A) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_A],
                copyImgFuncs.globalToImage[MATRIX_A],
                vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_A],
                copyImgFuncs.localToImage[MATRIX_A],
                vecLen);
        kgenAddStmt(ctx, tmp);
    }
    else { // PREP_B
        sprintf(tmp, prepareImagesGemmDeclB, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        ret = kgenBeginFuncBody(ctx);

        // same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadB,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].x, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                    "(transB == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
                             "    %s((LPtr)temp, (GPtr)B, n, k, ldb);\n"
                             "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_B]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageB,
                subdims[1].x, subdims[1].x, // y = n + dy <= N ?...
                subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].x, // aligned = (x==bw1)&&(y==dx1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_B) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_B],
                copyImgFuncs.globalToImage[MATRIX_B],
                vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_B],
                copyImgFuncs.localToImage[MATRIX_B],
                vecLen);
        kgenAddStmt(ctx, tmp);
    }

    kgenEndFuncBody(ctx);

    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#20
0
static int
copyMemVecTransp(struct KgenContext *ctx, void *priv)
{
    char tmp[1024];
    size_t i;
    GenPriv *gpriv = (GenPriv*)priv;
    unsigned int n = gpriv->nfloats;
    const char *tmpSuff[2][4] = {
            {"x", "y", "z", "w"},
            {"xy", "zw", NULL, NULL}};
    const char *dstSuff[4] = {"f", "f2v", NULL, "f4v"};
    const char *vfield;
    const char *s;

    vfield = dtypeUPtrField(gpriv->dtype);
    kgenAddBlankLine(ctx);

    if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) {
        sprintf(tmp, "tmp = *%s.f4v++;\n", gpriv->srcName);
        kgenAddStmt(ctx, tmp);

        if (gpriv->conjugate) {
            /*
             * Only complex float element can be conjugated here,
             * those of double complex type are processed with no vectrized
             * function
             */
            kgenAddStmt(ctx, "tmp.y = -tmp.y;\n"
                             "tmp.w = -tmp.w;\n");
        }

        for (i = 0; i < FLOAT4_VECLEN / n; i++) {
            if (gpriv->locLDName) {
                sprintf(tmp, "%s.%s[%s * %lu] = tmp.%s;\n",
                        gpriv->dstName, dstSuff[n - 1],
                        gpriv->locLDName, i, tmpSuff[n - 1][i]);
            }
            else {
                sprintf(tmp, "%s.%s[%lu] = tmp.%s;\n", gpriv->dstName,
                        dstSuff[n - 1], gpriv->lmemLD * i, tmpSuff[n - 1][i]);
            }
            kgenAddStmt(ctx, tmp);
        }
        s = gpriv->dstName;
    }
    else {
        for (i = 0; i < FLOAT4_VECLEN / n; i++) {
            if (gpriv->locLDName) {
                sprintf(tmp, "tmp.%s = %s.%s[%s * %lu];\n", tmpSuff[n - 1][i],
                        gpriv->srcName, dstSuff[n - 1], gpriv->locLDName, i);
            }
            else {
                sprintf(tmp, "tmp.%s = %s.%s[%lu];\n", tmpSuff[n - 1][i],
                        gpriv->srcName, dstSuff[n - 1], gpriv->lmemLD * i);
            }
            kgenAddStmt(ctx, tmp);
        }

        sprintf(tmp, "*%s.f4v++ = tmp;\n", gpriv->dstName);
        kgenAddStmt(ctx, tmp);

        s = gpriv->srcName;
    }

    if (gpriv->locLDName) {
        sprintf(tmp, "%s.%s += %s * %lu;\n", s, vfield, gpriv->locLDName, i);
    }
    else {
        sprintf(tmp, "%s.%s += %lu;\n", s, vfield, gpriv->lmemLD * i);
    }

    return kgenAddStmt(ctx, tmp);
}
示例#21
0
// generator optimizing to a subproblem size
static int
copyDBlockOptimGen(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    const PGranularity *pgran,
    GenPriv *gpriv)
{
    char fpref;
    const char varPref[2] = {'G', 'L'};
    char tmp[1024];
    // lead dimension for right and transposed local block in float words
    ItemWork work;
    LoopCtl loopCtl;
    LoopUnrollers unrollers;
    const char *s, *s1, *s2;
    bool image;
    SubproblemDim newDim;
    // copying direction within the memory or image related function group
    int gdir = 0;
    int r;

    fpref = dtypeToPrefix(gpriv->dtype);
    if (!fpref || (fpref == 'i')) {
        return -EINVAL;
    }

    image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE ||
             gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);

    memset(&unrollers, 0, sizeof(unrollers));
    memset(&loopCtl, 0, sizeof(loopCtl));
    memset(&newDim, 0, sizeof(newDim));

    gpriv->dim = &newDim;
    gpriv->work = (const ItemWork*)&work;
    gpriv->globLDName = "ld";
    s = (gpriv->transp) ? "Transp" : "";
    s1 = (gpriv->conjugate) ? "Conj" : "";
    s2 = (gpriv->notVectorize) ? "Nvec" : "";

    if ((gpriv->dir == DBLOCK_LOCAL_TO_GLOBAL) && gpriv->transp) {
        // pass over columns of the block stored in the local memory
        newDim.x = dim->y;
        newDim.y = dim->x;
    }
    else {
        // pass over rows
        newDim.x = dim->x;
        newDim.y = dim->y;
    }

    getItemWork(&work, &newDim, pgran, gpriv->nfloats, gpriv->vecLen);

    if (image) {
        s = (gpriv->packed) ? "Pack" : "";
        if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
            sprintf(tmp, copyMemGImgDBlockDecl, fpref, s, dim->y, dim->x);
        }
        else {
            sprintf(tmp, copyMemLImgDBlockDecl, fpref, s, dim->y, dim->x);
        }

    }
    else {
        gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
        sprintf(tmp, copyMemDBlockDecl, fpref, s, s1, s2, varPref[gdir],
                varPref[1 - gdir], dim->y, dim->x, varPref[1 - gdir],
                varPref[gdir]);
    }

    kgenDeclareFunction(ctx, tmp);
    kgenBeginFuncBody(ctx);

    kgenDeclareLocalID(ctx, lidVarName, pgran);

    if (image) {
        // data for loop unrolling
        if (work.nrRows > 1) {
            gpriv->srcName = "src1";
            gpriv->dstName = "dst";
            gpriv->imgXName="x1";
            gpriv->imgYName="y1";
            if(gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
                kgenAddStmt(ctx, "GPtr src1;\n");
            }
            else if(gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
                kgenAddStmt(ctx, "LPtr src1;\n");
            }
            kgenAddStmt(ctx, "int x1, y1;\n");

            unrollers.preUnroll = copyImgPreUnroll;
            unrollers.postUnroll = copyImgPostUnroll;
        }
        else {
            gpriv->srcName = "src";
            // dst has image2d_t type here
            gpriv->dstName = "dst";
            gpriv->imgXName="x";
            gpriv->imgYName="y";
        }
    }
    else {
        if ((gpriv->nfloats != FLOAT4_VECLEN) &&
            (gpriv->transp || gpriv->conjugate)) {

            /*
             * temporary variable to transpose or conjugate non double
             * complex elements
             */
            kgenAddStmt(ctx, "float4 tmp;\n");
        }

        if (work.nrRows > 1) {
            sprintf(tmp, privatePtrs, varPref[gdir], varPref[1 - gdir]);
            kgenAddStmt(ctx, tmp);

            // data for loop unrolling
            unrollers.preUnroll = copyMemPreUnroll;
            unrollers.postUnroll = copyMemPostUnroll;
            gpriv->srcName = "src1";
            gpriv->dstName = "dst1";
        }
        else {
            gpriv->srcName = "src";
            gpriv->dstName = "dst";
        }
    }

    if ((work.nrRows > 1) || work.nrItems) {
        prepareLoop(ctx, &work, &loopCtl);
    }
    kgenAddBlankLine(ctx);
    loopCtl.inBound = (unsigned long)work.nrCols;

    // now, prepare all needed for loop unrolling

    if (image) {
        kgenAddStmt(ctx, "int x, y;\n");
        if (gpriv->packed) {
            kgenAddStmt(ctx, "int pLine, index;\n");
        }
        gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) *
                           FLOAT4_VECLEN / gpriv->nfloats;
        // set up starting x and y in image
        addSettingImageXYCode(ctx, "x", "y", pgran, gpriv);

        if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
            // set initial global pointer
            addSettingPtrCode(ctx, "src", 0, false, pgran, gpriv);
        }
        else if (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) {
            // set initial local pointer
            addSettingPtrCode(ctx, "src", gpriv->lmemLD, gpriv->transp,
                              pgran, gpriv);
        }

        unrollers.genSingleVec = copyImgVec;
        unrollers.genSingle = copyImgSingle;
    }
    else {
        // set initial global pointer
        s = (gdir) ? "dst" : "src";
        addSettingPtrCode(ctx, s, 0, false, pgran, gpriv);

        s = (gdir) ? "src" : "dst";

        if (!gdir && gpriv->transp) {
            gpriv->lmemLD = fl4RowWidth(dim->y, gpriv->typeSize) *
                           FLOAT4_VECLEN / gpriv->nfloats;
        }
        else {
            gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) *
                           FLOAT4_VECLEN / gpriv->nfloats;
        }

        if (gpriv->transp) {
            unrollers.genSingleVec = (gpriv->notVectorize) ? NULL :
                                                             copyMemVecTransp;
            unrollers.genSingle = copyMemSingleTransp;
        }
        else {
            unrollers.genSingleVec = (gpriv->notVectorize) ? NULL : copyMemVec;
            unrollers.genSingle = copyMemSingle;
        }

        addSettingPtrCode(ctx, s, gpriv->lmemLD, gpriv->transp,
                          pgran, gpriv);
    }
    unrollers.getVecLen = getVecLen;

    // unroll for float4 aligned data chunk
    kgenLoopUnroll(ctx, &loopCtl, gpriv->dtype, &unrollers, gpriv);

    /*
     * Unroll for remaining data tail.
     * Block tail reading/writing is done separately
     * when many work items process single row
     * because the compiler don't like any conditional
     * branches in loops
     */
    if ((unrollers.postUnroll == NULL) && work.tail) {
        addCopyTailCode(ctx, gpriv);
    }

    r = kgenEndFuncBody(ctx);

    return r ? -EOVERFLOW : 0;
}
示例#22
0
int
generateBufCopyFuncs(
    CopyBufFuncs *funcNames,
    struct KgenContext *ctx,
    BlasFunctionID funcID,
    const BlasGenSettings *gset,
    BufCopyHelperFlags flags)
{
    CopyPattern pattern;
    struct KgenGuard *guard;
    int ret = 0;
    MatrixRole mrole;
    bool needed[MATRIX_ROLES_NUMBER];
    KernelExtraFlags kgenFlags = gset->kextra->flags;
    DataType dtype = gset->kextra->dtype;
    const SubproblemDim *blasDim = gset->subdims;
    const PGranularity *pgran = gset->pgran;
    bool outputTails = (kgenFlags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));

    guard = createKgenGuard(ctx, cpyGenCallback, sizeof(CopyPattern));
    if (guard == NULL) {
        return -ENOMEM;
    }

    memset(&pattern, 0, sizeof(pattern));

    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    pattern.dtype = dtype;
    pattern.pgran = pgran;

    needed[MATRIX_A] = (flags & BCHF_MATRIX_A);
    needed[MATRIX_B] = (flags & BCHF_MATRIX_B);
    needed[MATRIX_C] = (flags & BCHF_READ_OUTPUT);

    for (mrole = MATRIX_A; mrole <= MATRIX_C; mrole++) {
        if (!needed[mrole]) {
            continue;
        }

        initCopyPattern(&pattern, blasDim, kgenFlags, mrole, funcID);
        findGenerateFunction(guard, &pattern, funcNames->read[mrole],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    if (flags & BCHF_WRITE_OUTPUT) {
        if (flags & BCHF_IMAGE_WRITE) {
            pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
            initCopyPattern(&pattern, NULL, kgenFlags, MATRIX_A, funcID);
            pattern.flags &= ~DBLOCK_COPY_TRANSPOSE;
        }
        else {
            pattern.dir = DBLOCK_LOCAL_TO_GLOBAL;
            initCopyPattern(&pattern, blasDim, kgenFlags, MATRIX_C, funcID);
        }
        ret = findGenerateFunction(guard, &pattern, funcNames->write,
                                   FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    if (ret) {
        destroyKgenGuard(guard);

        return ret;
    }

    // reevaluate needed flags
    needed[MATRIX_A] = needed[MATRIX_A] &&
        (kgenFlags & (KEXTRA_TAILS_M | KEXTRA_TAILS_K));
    needed[MATRIX_B] = needed[MATRIX_B] &&
        (kgenFlags & (KEXTRA_TAILS_N | KEXTRA_TAILS_K));
    needed[MATRIX_C] = needed[MATRIX_C] && outputTails;

    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    for (mrole = MATRIX_A; mrole <= MATRIX_C; mrole++) {
        if (!needed[mrole]) {
            continue;
        }

        initCopyPattern(&pattern, NULL, kgenFlags, mrole, funcID);
        findGenerateFunction(guard, &pattern,
                             funcNames->readGeneric[mrole],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    if ((flags & BCHF_WRITE_OUTPUT) && outputTails) {
        if (flags & BCHF_IMAGE_WRITE) {
            pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
            initCopyPattern(&pattern, NULL, kgenFlags, MATRIX_A, funcID);
            pattern.flags &= ~DBLOCK_COPY_TRANSPOSE;
        }
        else {
            pattern.dir = DBLOCK_LOCAL_TO_GLOBAL;
            initCopyPattern(&pattern,NULL, kgenFlags, MATRIX_C, funcID);
        }
        ret = findGenerateFunction(guard, &pattern, funcNames->writeGeneric,
                                   FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    destroyKgenGuard(guard);

    return ret;
}
示例#23
0
int
tileMulGen(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    const TileMulOpts *mulOpts)
{
    char s[MAX_LENGTH];
    unsigned int vlenA, vlenB;
    unsigned int i, iend; //counters
    // size_t m, n, subK;
    int ret = 0;
    TileMulFlags mflags = mulOpts->flags;
    bool tra = ((mflags & TILEMUL_TRA) != 0);
    bool trb = ((mflags & TILEMUL_TRB) != 0);
    bool localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
    bool localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
    bool internalFetchB = ((mflags & TILEMUL_NOT_FETCH_B) == 0);
    bool bwStride = ((mflags & TILEMUL_BW_STRIDE) != 0);
    bool incK = ((mflags & TILEMUL_NOT_INC_K) == 0);
    const SubproblemDim *subdims = gset->subdims;
    size_t bwidth = bwStride ? subdims[0].bwidth : subdims[1].bwidth;
    TileMulCore core = mulOpts->core;
    DataType dtype = gset->kextra->dtype;
    const KernelVarNames *varNames = &gset->varNames;
    FetchOpts fetchOpts;
    struct FetchContext *fctx = mulOpts->fctx;
    FetchAddrMode addrMode;
    FetchOptLevel foptlev;
    struct StatementBatch *batch = NULL;
    const Tile *tile;

    memset(&fetchOpts, 0, sizeof(fetchOpts));
    fetchOpts.memA = mulOpts->memA;
    fetchOpts.memB = mulOpts->memB;

    kgenAddStmt(ctx, "/* -- Tiles multiplier -- */\n");

    getVecLens(gset, &vlenA, &vlenB, NULL);

    /* check generator input values */
    ret = checkInput(gset, mulOpts);
    if (ret) {
        return ret;
    }

    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
        sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
                subdims[0].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, s);
    }

    core = checkReplaceCore(gset, core, tra, trb);
    if (((core == TILEMUL_MULADD || isComplexType(dtype)) &&
            !tra && trb)) {

        unsigned int n;
        const char *tname;

        n = commonTileSegmentLen(&gset->tileA, &gset->tileBX);
        getVectorTypeName(gset->tileA.dtype, n, &tname, NULL);

        sprintf(s,"%s sum;\n", tname);
        kgenAddStmt(ctx, s);
    }

    // FIXME: remove this kludge for backward compatibility
    if (fctx == NULL) {
        fctx = createFetchContext();
        if (fctx == NULL) {
            return -ENOMEM;
        }
        fetchOpts.mulOpts = mulOpts;
    }
    //////////////////////////////////////////////////////

    foptlev = getFetchOptLevels(fctx);

    if ((gset->flags & BGF_WHOLE_A) && internalFetchB &&
        (foptlev & FOPTLEV_MERGE_FETCHES)) {

        batch = createStmtBatch();
        if (batch == NULL) {
            ret = -ENOMEM;
            goto out;
        }
    }

    /*
     * First, disable sharing internal variables of the fetch code for
     * the first call so as the fetch generator could declares it for the
     * first matrix. And then re-enable it when invoking the fetch for
     * the other matrix if it has been actually enabled.
     */

    disableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);

    /*
     * fetch elements of the matrix B, by rows or by columns depending on
     * the transposing flag
     */
    if (internalFetchB) {
        tile = &gset->tileBX;
        fetchOpts.mrole = MATRIX_B;
        fetchOpts.linesNum = trb ? tile->nrCols : tile->nrRows;
        if (batch == NULL) {
            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
            if (!ret) {
                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
            }
        }
        else {
            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
        }
    }

    fetchOpts.mrole = MATRIX_A;

    if (foptlev & FOPTLEV_CAN_SHARE_TMP_AB) {
        enableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);
    }

    if (ret) {
        goto out;
    }

    if (gset->flags & BGF_WHOLE_A) {
        tile = &gset->tileA;
        iend = (tra) ? tile->nrCols : tile->nrRows;
        fetchOpts.linesNum = iend;
        if (batch == NULL) {
            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
        }
        else {
            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
            ret = flushStmtBatch(ctx, batch);
            if (!ret) {
                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
            }
        }

        if (!ret) {
            ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
        }
        if (ret) {
            goto out;

        }

        // main multiplying loop
        for (i = 0; i < iend; i++) {
            if (i) {
                kgenAddBlankLine(ctx);
            }
            genMulLineOnTile(ctx, gset, mulOpts, i, true);
        }
    }
    else {
        iend = (unsigned int)((tra) ? subdims[1].bwidth : subdims[1].y);
        fetchOpts.linesNum = 1;

        // main multiplying loop
        for (i = 0; i < iend; i++) {
            if (i) {
                kgenAddBlankLine(ctx);
                revalidateFetchContext(fctx, MATRIX_A);
            }
            // fetch elements of matrix A from single row
            fetchOpts.lineOffset = i;
            genFetchInputTile(ctx, fctx, gset, &fetchOpts);
            ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
            if (ret) {
                goto out;
            }
            genMulLineOnTile(ctx, gset, mulOpts, i, false);
        }
    }

    /*
     * increment K-related coordinates or pointers depending on addressing
     * mode
     */
    addrMode = getFetchAddrMode(fctx);
    if (addrMode & FETCH_ADDR_K_RELATIVE) {
        kgenAddBlankLine(ctx);
        genPointerUpdate(ctx, varNames->A, varNames->lda, bwidth,
                         subdims[0].y, vlenA, dtype, gset->flags,
                         !tra, localA);

        genPointerUpdate(ctx, varNames->B, varNames->ldb, bwidth,
                         subdims[0].x, vlenB, dtype, gset->flags,
                         trb, localB);
    }
    else {
        if (incK && (varNames->k != NULL) && !(localA && localB)) {
            sprintf(s, "\n%s += %lu;\n", varNames->k, bwidth);
            kgenAddStmt(ctx, s);
        }
    }

    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
        kgenEndBranch(ctx, NULL); // k1 loop
    }
    ret = kgenAddStmt(ctx, "/* ---------------------- */\n");
    ret = (ret) ? -EOVERFLOW : 0;

out:
    if (batch != NULL) {
        destroyStmtBatch(batch);
    }
    if (fctx != mulOpts->fctx) {
        destroyFetchContext(fctx);
    }

    return ret;
}
示例#24
0
void
genTileCopy(
    struct KgenContext *ctx,
    const Tile *dst,
    const Tile *src,
    TileCopyOps op)
{
    char tmp[1024];
    Kstring el1, el2;
    unsigned int nrRows, nrCols;
    unsigned int incRows, incCols;
    unsigned int vlen;
    unsigned int i, j;

    nrRows = umin(dst->nrRows, src->nrRows);
    nrCols = umin(dst->nrCols, src->nrCols);
    if (dst->trans != src->trans) {
        vlen = 1;
        incRows = incCols = 1;
    }
    else {
        vlen = umin(dst->vecLen, src->vecLen);
        if (!dst->trans) {
            incRows = 1;
            incCols = umin(dst->nrCols, src->nrCols);
            incCols = umin(incCols, vlen);
        }
        else {
            incRows = umin(dst->nrRows, src->nrRows);
            incRows = umin(incRows, vlen);
            incCols = 1;
        }
    }

    for (i = 0; i < nrRows; i += incRows) {
        for (j = 0; j < nrCols; j += incCols) {
            sprintfTileElement(&el1, dst, i, j, vlen);
            sprintfTileElement(&el2, src, i, j, vlen);
            switch( op )
            {
                case TILECOPY_ASSIGN:
                    sprintf(tmp, "%s = %s;\n", el1.buf, el2.buf);
                    break;

                case TILECOPY_ADD_ASSIGN:
                    sprintf(tmp, "%s += %s;\n", el1.buf, el2.buf);
                    break;

                case TILECOPY_SUB_ASSIGN:
                    sprintf(tmp, "%s -= %s;\n", el1.buf, el2.buf);
                    break;

                case TILECOPY_MUL_ASSIGN:
                    sprintf(tmp, "%s *= %s;\n", el1.buf, el2.buf);
                    break;

                case TILECOPY_DIV_ASSIGN:
                    sprintf(tmp, "%s /= %s;\n", el1.buf, el2.buf);
                    break;

                case TILECOPY_MOD_ASSIGN:
                    sprintf(tmp, "%s %%= %s;\n", el1.buf, el2.buf);
                    break;

                default:
                    break;
            }
            kgenAddStmt(ctx, tmp);
        }
    }

    kgenAddBlankLine(ctx);
}
示例#25
0
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    char tmp[4096], tmp1[4096];
    char *p;
    // is the iteration over N, N at the top level
    const char *typeName;
    char fpref;
    DataType dtype = kextra->dtype;
    ssize_t ret;
    BlasGenSettings gset;
    BlkMulOpts mulOpts;
    unsigned int tsize;
    unsigned int vecLen, outVecLen;
    bool b;
    const char *outTypeName;
    unsigned int i;
    unsigned int nrRegs, regPitch;
    int tra, trb;
    char vect[2] = {'y', 'x'};

    const char *coordConstants =
        "const uint workItemM = get_global_id(0) * %lu;\n"
        "const uint workItemN = get_global_id(1) * %lu;\n"
        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
        "uint vectK = (K + %u) / %u;\n";

    /*
     *  template for image based gemm preparation part
     *  for two dimensional work space
     */
    const char *localVariables =
        "uint k0;\n"
        "int2 coordA = (int2)(0, workItemM);\n"
        "int2 coordB = (int2)(0, workItemN);\n"
        "%s c[%u];\n\n";

    tsize = dtypeSize(dtype);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    if (isComplexType(dtype)) {
        regPitch = (unsigned int)subdims[1].x;
    }
    else {
        regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) *
                    sizeof(cl_float4) / tsize;
    }

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;
    initKernelVarNames(&gset.varNames, kextra->flags);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations and auxiliary functions
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    // now, generate the kernel

    sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
            typeName, typeName, typeName);
    kgenDeclareFunction(ctx, tmp);
    ret = kgenBeginFuncBody(ctx);

    // constants
    sprintf(tmp, coordConstants,
            subdims[1].y, subdims[1].x, subdims[1].y,
            vecLen - 1, vecLen);
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then declare local
     * variables
     */
    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);

    sprintf(tmp, localVariables, outTypeName, nrRegs);
    kgenAddStmt(ctx, tmp);

    // check if offset exceeds matrix
    kgenAddStmt(ctx, "if ((workItemM >= M) ||"
                         "(workItemN >= N)) {\n"
                     "    return;\n"
                     "}\n");

    kgenAddStmt(ctx, "C += offsetC;\n");

    // zero C block
    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
                 "    c[k0] = 0;\n"
                 "}\n\n",
            nrRegs);
    kgenAddStmt(ctx, tmp);

    // block multiplication inlined function
    sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)",
            subdims[1].bwidth / vecLen);
    kgenBeginBranch(ctx, tmp);

    mulOpts.aMobj = CLMEM_IMAGE;
    mulOpts.bMobj = CLMEM_IMAGE;
    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE;
    if (isComplexType(dtype)) {
        mulOpts.core = BLKMUL_SEPARATE_MULADD;
    }
    else {
        mulOpts.core = BLKMUL_MAD;
    }
    mulOpts.argNames.coordA = "coordA";
    mulOpts.argNames.coordB = "coordB";
    mulOpts.argNames.skewCol = "skewCol";
    mulOpts.argNames.skewRow = "skewRow";
    mulOpts.argNames.k = "k0";
    mulOpts.argNames.vectBoundK = "vectK";
    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
    if (ret) {
        destroyKgenContext(ctx);
        return -EOVERFLOW;
    }

    // update image coordinates
    sprintf(tmp, "\ncoordA.x += %lu;\n"
                 "coordB.x += %lu;\n",
            subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
    kgenAddStmt(ctx, tmp);

    kgenEndBranch(ctx, NULL);

    // reorder the given solution
    outVecLen = isComplexType(dtype) ? 1 : vecLen;
    p = tmp1;
    for (i = 0; i < regPitch / outVecLen; i++) {
        unsigned int k = (unsigned int)(subdims[1].y - 1) *
                         regPitch / outVecLen + i;

        sprintf(p,  "\n"
                    "    tmp = c[%u];\n"
                    "    for (j = %lu; j >= 0; j--) {\n"
                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                    "    }\n"
                    "    c[%u] = tmp;\n",
                k, subdims[1].y - 2, regPitch / outVecLen,
                i, regPitch / outVecLen, i, i);
        p += strlen(p);
    }
    sprintf(tmp, "\n"
                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
                 "    int j;\n"
                 "    %s tmp;\n"
                 "%s"
                 "}\n"
                 "\n",
                 outTypeName, tmp1);
    kgenAddStmt(ctx, tmp);

    tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B);
    sprintf(tmp, "coordA.%c = workItemM;\n"
                 "coordB.%c = workItemN;\n\n",
            vect[tra], vect[trb]);
    kgenAddStmt(ctx, tmp);

    // write back the tile evaluated
    generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#26
0
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    char tmp[1024];
    struct KgenContext *ctx;
    ssize_t ret;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    DataType dtype = kextra->dtype;
    KernelExtraFlags kflags = kextra->flags;
    CLBLASKernExtra extraNew;
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    const char *ptrName;
    UpdateResultFlags upFlags = 0;
    TilePostFetchPrivate pfPriv;
    unsigned int l1Pans;
    bool b;
    Tile parTile;
    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
    int ldsLarge, lds_diagonal;
    bool isInline;
    TileSet tileSet;
    char copy2LDSFuncName[FUNC_NAME_MAXLEN];
    TailStatus tailStatus = 0;
    FetchAddrMode addrMode = 0;
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    bool tailN = ((kflags & KEXTRA_TAILS_N) != 0);
    size_t alignK;

    if (pgran->wgDim != 1) {
        return -EINVAL;
    }

    l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);

    memset(&gset, 0, sizeof(gset));
    gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS;
    memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2);
    // there is not need in block structure along K
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    subdims = gset.subdims;

    /*
     * Since tiles are changed dynamically, e. g. in the main tilemul
     * loop they are rectangular, but at the second stage both A and B
     * tile storages are used for square tiles. One must adjust physical
     * vectorization accordindly, so as vector length might not be
     * greater than linear size of any tile
     */
    memcpy(&extraNew, kextra, sizeof(extraNew));
    extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y);
    extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y);

    gset.pgran = pgran;
    gset.kextra = &extraNew;
    initKernelVarNames(&gset.varNames);

    // multiplication options
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
    mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD :
                                                         TILEMUL_MULADD;
    mulOpts.postFetch = NULL;
    mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
    mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K;
    mulOpts.fctx = createFetchContext();
    if (mulOpts.fctx == NULL) {
        return -ENOMEM;
    }

    disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING);

    isInline = (gset.flags & BGF_EXPLICIT_INLINE);

    initTiles(&gset, &tileSet, subdims, kflags, dtype,
              PRIV_STORAGE_VARIABLE_SET);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        destroyFetchContext(mulOpts.fctx);
        return -ENOMEM;
    }

    kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n");

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    if (isComplexType(dtype)) {
        genComplexMathOperators(ctx, dtype);
    }
    if(!isInline) {
        genTileInverting(ctx, &gset, &tileSet);
    }

    if ( extraParams->ldsUse != LDS_NO_USE ) {
        SubproblemDim sdims;
        DBlockCopyFlags flags;
        unsigned int vecLen;

        if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
            sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor;
            sdims.y = gset.subdims[0].x;
        }
        else {
            sdims.x = gset.subdims[0].x;
            sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor;
        }

        vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B);
        flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0;
        copyDataBlockGen(ctx, &sdims, gset.pgran, dtype,
                         DBLOCK_GLOBAL_TO_LOCAL, flags);
        kgenAddBlankLine(ctx);
        kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx);
    }

    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false,
                      true);
    kgenBeginFuncBody(ctx);

    declareLocalVariables(ctx, &gset, &parTile, extraParams);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        kgenAddStmt(ctx, "A += offA;\n");
    }
    genTrxmBMatrShift(ctx, kflags, false);

    ptrName = dtypeUPtrField(dtype);

    sprintf(tmp, "uB.%s = B;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // external loop
    sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
    kgenBeginBranch(ctx, tmp);
    genZeroTile(ctx, &gset.tileCY);
    genSetupCoords(ctx, &gset, BLOCK_UPDATE);

    kgenAddStmt(ctx, "// Stage 1. Multiply and update with large blocks\n");

    gset.tileA = tileSet.rectA;
    gset.tileBX = tileSet.origB;

    if (!isMatrixUpper(kflags) && tailM) {
        addrMode |= FETCH_ADDR_A_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0);
    alignK = subdims[1].bwidth;
    if (ldsLarge) {
        alignK *= extraParams->unrollingFactor;
    }

    if (ldsLarge) {
        const char *oldCoordB;
        FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE;
        bool withSkew;

        withSkew = useSkewedFetchB(&gset);
        if (!withSkew) {
            bamode |= FETCH_ADDR_B_RELATIVE;
        }
        else {
            bamode |= FETCH_ADDR_B_CYCLICAL;
        }

        setFetchAddrMode(mulOpts.fctx, bamode);

        if (tailN) {
            /*
             * Conditional branch for those items which hit into
             * matrix B with their matrix coordinates
             */
            sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x);
            kgenBeginBranch(ctx, tmp);
        }

        if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
            kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName);
        }
        else {
            kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName);
        }

        if (withSkew) {
            unsigned int bwidthOld;

            oldCoordB = gset.varNames.coordB;
            gset.varNames.coordB = "skewX";
            bwidthOld = gset.subdims[0].bwidth;
            gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows :
                                                       parTile.nrCols;
            gset.subdims[0].bwidth = bwidthOld;
        }

        genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK);
        genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile, copy2LDSFuncName);
        genInternalLoopEnd(ctx);                             // loop over K

        if (withSkew) {
            gset.varNames.coordB = oldCoordB;
            setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL);
            // deliver from skew in the result before proceed to the next stage
            genTileCyclicalShift(ctx, &gset);
        }

        if (tailN) {
            kgenEndBranch(ctx, NULL);
            kgenBeginBranch(ctx, "else");
        }

        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    if (!ldsLarge || tailN) {
        genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
        if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) {
            addrMode |= FETCH_ADDR_B_CYCLICAL;
            setFetchAddrMode(mulOpts.fctx, addrMode);
        }

        if (tailN) {
            sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")");
            kgenBeginBranch(ctx, tmp);
        }

        genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK);
        tileMulGen(ctx, &gset, &mulOpts);
        genInternalLoopEnd(ctx);                             // loop over K

        if (tailN) {
            kgenEndBranch(ctx, NULL);
        }

        if (extraParams->ldsUse & LDS_USE_LARGE) {
            kgenEndBranch(ctx, NULL);
        }
    }

    sprintf(tmp, "uA.%s = A;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // processing tails along update dimension
    if (isMatrixUpper(kflags) &&
        ((kflags & KEXTRA_TAILS_K_LOWER) ||
          (ldsLarge && extraParams->unrolledTail))) {

        unsigned int tailChunks;

        tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ?
            extraParams->unrolledTail : 1;

        if (tailN) {
            char hitCond[1024];

            sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")");
            sprintf(tmp, "if ((currM + %lu < M) && %s)",
                    subdims[0].y, hitCond);
        }
        else {
            sprintf(tmp, "if (currM + %lu < M)", subdims[0].y);
        }
        kgenBeginBranch(ctx, tmp);

        if (kflags & KEXTRA_TAILS_K_LOWER) {
            setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL);
            setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        }
        if (tailChunks > 1) {
            mulOpts.flags &= ~TILEMUL_NOT_INC_K;
            sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks);
            kgenBeginBranch(ctx, tmp);
        }

		addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);
        if (tailChunks > 1) {
            kgenEndBranch(ctx, NULL);
            mulOpts.flags |= TILEMUL_NOT_INC_K;
        }

        kgenEndBranch(ctx, NULL);
    }

    gset.tileA = tileSet.squareA;

    kgenAddStmt(ctx, "\n/*\n"
                     " * Stage 2. A part of work items multiply got result on "
                     "a respective\n"
                     " * inverted diagonal block, and the remaining ones wait. "
                     "Then they perform\n"
                     " * one step of further intermediate result evaluation as "
                     "multiplying tile by tile.\n"
                     " * It continues until the whole panel of the "
                     "matrix A is processed\n"
                     " */\n");

    // one must deal further with square blocks strictly
    gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y;

    sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)", subdims[0].y / subdims[1].y);
    kgenBeginBranch(ctx, tmp);

    if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
        sprintf(tmp, "const int bid = lid %% %u;\n\n",
                l1Pans);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Update the intermediate result multiply on the inverted diagonal tile,
     * and write back
     */
    genSetupCoords(ctx, &gset, TILE_UPDATE);

    sprintfStage2Condition(tmp, &gset, 0);
    ret = kgenBeginBranch(ctx, tmp);

    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);
    upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA;
    genUpdateIntermResult(ctx, &gset, false, upFlags);

    kgenAddBlankLine(ctx);

    lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) &&
                    (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 &&
                    !(tailM || tailN) &&
                    !(upFlags & UPRES_NO_VECTORIZATION) &&
                    !isComplexType(kextra->dtype));

    /*
     * it's needed now to adjust addressing mode of A so as to don't
     * exceed the bound of A
     */
    if (tailM) {
        setFetchAddrMode(mulOpts.fctx,
                         addrMode | FETCH_ADDR_A_CYCLICAL |
                         FETCH_ADDR_K_CYCLICAL);
        extraNew.flags |= KEXTRA_TAILS_K_LOWER;
    }

    genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts);
    gset.tileBX = tileSet.bStage2;
    if (tailM) {
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
    }

    kgenAddStmt(ctx, "// Write back the given result\n");

    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);

    if (lds_diagonal) {
       sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans);
    }

    genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags,
                                 NULL, NULL, lds_diagonal ? tmp : NULL);

    kgenEndBranch(ctx, NULL);   // multiply on the inverted tile path
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    // continue the tile update
    kgenAddBlankLine(ctx);
    sprintfStage2Condition(tmp, &gset, 1);
    kgenBeginBranch(ctx, tmp);
    genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
    if (lds_diagonal) {
        // TODO: add here storing to LDS as well
    }
    else {
		addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);
    }
    kgenEndBranch(ctx, NULL);           // tile update path
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    kgenEndBranch(ctx, NULL);           // second stage loop

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
        kgenAddStmt(ctx, tmp);
    }

    kgenEndBranch(ctx, NULL);           // loop over M

    ret = kgenEndFuncBody(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyFetchContext(mulOpts.fctx);
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
示例#27
0
/*
 * NOTE: Before invoking this function 'tileA' must be initialized accordingly
 *       so as it stores a square tile of the matrix A.
 */
static void
genMulOnDiagonalTile(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileSet *tileSet,
    const TileMulOpts *mulOpts)
{
    char tmp[1024];
    FetchOpts fetchOpts;
    const SubproblemDim *dim = &gset->subdims[1];
    TilePostFetchPrivate pfPriv[2];
    TileMulOpts optsNew;
    const CLBLASKernExtra *extra = gset->kextra;
    CLBLASKernExtra extraNew;
    KernelExtraFlags kflags = extra->flags;
    Tile t;
    bool isTail;

    memset(&fetchOpts, 0, sizeof(fetchOpts));
    fetchOpts.regName = "b";
    fetchOpts.mrole = MATRIX_A;
    fetchOpts.lineOffset = 0;
    fetchOpts.linesNum = (unsigned int)dim->y;

    // setup options to multiply on the inverted tile
    memcpy(&optsNew, mulOpts, sizeof(TileMulOpts));
    optsNew.flags &= ~TILEMUL_TRB;

    kgenAddStmt(ctx, "// Fetch and invert the square tile located on the "
                     "diagonal\n");

    // The matrix B play the role of A
    t = substituteTile(&gset->tileA, &tileSet->bAsSqA);

    isTail = ((kflags & KEXTRA_TAILS_M) != 0);
    genFetchInputTile(ctx, mulOpts->fctx, gset, &fetchOpts);
    setFetchHandler(&optsNew, gset, genTrxmPostFetchZero, pfPriv);

    /*
     * There is no needs in zeroing tail along K in case of the lower
     * triangular matrix because it is in the "other" triangle which is
     * never accessed
     */
    if (isTail && !isMatrixUpper(kflags)) {
        memcpy(&extraNew, extra, sizeof(extraNew));
        extraNew.flags &= ~KEXTRA_TAILS_K_LOWER;
        gset->kextra = &extraNew;
    }
    genTrxmPostFetchZero(ctx, MATRIX_A, pfPriv);

    /*
     * One must zero the tail part of a fetched square tile
     * in order to avoid influence of the trailing trash on the resulting
     * inverted tile (evaluating proceeds from the bottom towards the top
     *                of the tile)
     */
    if (isTail) {
        genZeroTileTrash(ctx, gset, MATRIX_A, &gset->tileA);
    }

    restoreTile(&gset->tileA, &t);

    if(gset->flags & BGF_EXPLICIT_INLINE) {
        genTileInverting(ctx, gset, tileSet);
    }
    else {
        sprintf(tmp, "invertTile(%s, %s);\n\n",
                tileSet->squareA.baseName, tileSet->bAsSqA.baseName);
        kgenAddStmt(ctx, tmp);
    }

    gset->tileBX = tileSet->bAsC;
    genTileCopy(ctx, &gset->tileBX, &gset->tileCY, TILECOPY_ASSIGN);

    /*
     * For the lower diagonal not integrally decomposed matrix A
     * it's enough to zero the tail part of the result in order to
     * clear trash accumulated over the update loop
     */
    if (isTail && !isMatrixUpper(kflags)) {
        genZeroTileTrash(ctx, gset, MATRIX_B, &gset->tileBX);
    }

    genZeroTile(ctx, &gset->tileCY);

    genMulTiles(ctx, gset, &optsNew);
    kgenAddBlankLine(ctx);

    // restore original extra
    gset->kextra = extra;
}
示例#28
0
static void
genTileInverting(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    const TileSet *tileSet)
{
    char tmp[1024];
    const CLBLASKernExtra *kextra = gset->kextra;
    KernelExtraFlags kflags = kextra->flags;
    DataType dtype = kextra->dtype;
    const SubproblemDim *dim = &gset->subdims[1];
    unsigned int accLen;
    unsigned int i, j, k;
    Tile srcTile;
    Tile dstTile;
    bool isU, isComplex;
    bool isInlined = gset->flags & BGF_EXPLICIT_INLINE;
    const char* typeNameA;
    const char* typeNameB;

    memcpy(&srcTile, &tileSet->bAsSqA, sizeof(srcTile));
    memcpy(&dstTile, &tileSet->squareA, sizeof(dstTile));

    getVectorTypeName(kextra->dtype, dstTile.vecLen, &typeNameA, NULL);
    getVectorTypeName(kextra->dtype, srcTile.vecLen, &typeNameB, NULL);
    isU = isMatrixUpper(kflags);
    isComplex = isComplexType(dtype);

    if (isComplex || dstTile.trans) {
        accLen = 1;
    }
    else {
        accLen = umin(srcTile.vecLen, dstTile.vecLen);
        accLen = umin(accLen, srcTile.nrCols);
    }

    if (!isInlined) {
        dstTile.baseName = "a";
        srcTile.baseName = "b";
        sprintf(tmp, "void\n"
                     "invertTile(%s *a, %s *b)\n",
                typeNameA, typeNameB);
        kgenDeclareFunction(ctx, tmp);
        kgenBeginFuncBody(ctx);
    }
    else {
        kgenAddStmt(ctx, "// Invert tile\n");
    }

    // made destination block unit
    genZeroTile(ctx, &dstTile);
    for (i = 0; i < dim->y; i++) {
        genSetUnitInTile(ctx, &dstTile, i, i);
    }
    kgenAddBlankLine(ctx);

    for (i = 0; i < dim->y; i++) {
        Kstring src, srcDiag, dst, dstLast;

        // current source diagonal element
        sprintfInvertedElement(&srcDiag, &srcTile, i, i, 1, isU);
        for (j = i; j < dim->y; j++) {
            // current source non diagonal element
            if (i) {
                sprintfInvertedElement(&src, &srcTile, j, i - 1, 1, isU);
            }

            for (k = 0; k < dim->y; k += accLen) {
                // current updated vectorized element
                sprintfInvertedElement(&dst, &dstTile, j, k, accLen, isU);

                // update
                if (i) {
                    // last updated vectorized element
                    sprintfInvertedElement(&dstLast, &dstTile, i - 1, k,
                                           accLen, isU);
                    if (isComplex) {
                        sprintf(tmp, "%s -= mul(%s, %s);\n",
                                dst.buf, dstLast.buf, src.buf);
                    }
                    else {
                        sprintf(tmp, "%s -= %s * %s;\n",
                                dst.buf, dstLast.buf, src.buf);
                    }
                    kgenAddStmt(ctx, tmp);
                }

                // divide on the diagonal element
                if (j == i) {
                    if (isComplex) {
                        sprintf(tmp, "%s = div(%s, %s);\n",
                                dst.buf, dst.buf, srcDiag.buf);
                    }
                    else {
                        sprintf(tmp, "%s /= %s;\n", dst.buf, srcDiag.buf);
                    }
                    kgenAddStmt(ctx, tmp);
                }
            }
        }
        if (i != dim->y - 1) {
            kgenAddBlankLine(ctx);
        }
    }

    if (!isInlined) {
        kgenEndFuncBody(ctx);
    }
    kgenAddBlankLine(ctx);

}
示例#29
0
int
genResultUpdateWithFlags(
    struct KgenContext *ctx,
    BlasFunctionID funcID,
    const BlasGenSettings *gset,
    UpdateResultFlags flags,
    const char *optFuncName,
    const char *genericFuncName,
    const char *cachedName)
{
    KernelExtraFlags kflags = gset->kextra->flags;
    UpdateResultOp op;
    char tmp[1024];
    int ret = 0;
    const char *coordY, *coordX;
    UpresVarNames uvars;
    const KernelVarNames *kvarNames = &gset->varNames;
    const SubproblemDim *dim = &gset->subdims[1];
    bool areTails, useCondition;

    memset(&uvars, 0, sizeof(uvars));

    coordX = kvarNames->coordB;
    coordY = kvarNames->coordA;

    if (funcHasTriangMatrix(funcID)) {
        if (flags & UPRES_TRIANG_WRITE_C) {
            uvars.result = "C";
        }
        else {
            uvars.result = "B";
        }
        uvars.ld = "ldb";
    }
    else {
        uvars.result = "C";
        uvars.ld = "ldc";
    }

    uvars.cachedName = cachedName;

    /* For now, kernels that do not use UPRES_EXCEED_PROBLEM_CONDITION
     * must return in case problem exceeds more precise lower level conditions
     * (KEXTRA_TAILS_M_LOWER, KEXTRA_TAILS_N_LOWER) before updating result
    */
    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
    useCondition = areTails && ((flags & UPRES_EXCEED_PROBLEM_CONDITION) != 0);
    if (useCondition) {
        bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
        bool tailN = (kflags & KEXTRA_TAILS_N) != 0;

        if (tailM) {
            if (tailN) {
                sprintf(tmp, "if ((%s < %s) && (%s < %s))",
                        coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
            }
            else {
                sprintf(tmp, "if (%s < %s)", coordY, kvarNames->sizeM);
            }
        }
        else {
            // here tailN is true
            sprintf(tmp, "if (%s < %s)", coordX, kvarNames->sizeN);
        }
        kgenBeginBranch(ctx, tmp);
    }
    else {
        kgenAddBlankLine(ctx);
    }

    if (optFuncName) {
        const char *betaStr;
        betaStr = (flags & UPRES_WITH_BETA) ? ", beta" : "";

        // update with functions invoking
        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
            sprintf(tmp, "%s(%s, c, alpha, %s, %s, %s%s);\n",
                    optFuncName, uvars.result, coordY, coordX,
                    uvars.ld, betaStr);
        }
        else {
            sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
                         "uint x = min(%luu, %s - (uint)%s);\n"

                         "if ((y == %lu) && (x == %lu)) {\n"
                         "    %s(%s, c, alpha, %s, %s, %s%s);\n"
                         "}\n"
                         "else {\n"
                         "    %s(%s, c, alpha, %s, %s, %s%s, y, x);\n"
                         "}\n",
                     dim->y, kvarNames->sizeM, coordY,
                     dim->x, kvarNames->sizeN, coordX,
                     dim->y, dim->x,
                     optFuncName, uvars.result, coordY, coordX, uvars.ld,
                     betaStr,
                     genericFuncName, uvars.result, coordY, coordX, uvars.ld,
                     betaStr);
        }

        kgenAddStmt(ctx, tmp);
    }
    else {
        // inline result update
        flags |= UPRES_INLINE;

        op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;

        uvars.startRow = coordY;
        uvars.startCol = coordX;
        uvars.nrRows = "y";
        uvars.nrCols = "x";

        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
            ret = updateResultGen(ctx,
                gset,
                funcID,
                op,
                flags,
                &uvars);
        }
        else {
            sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
                         "uint x = min(%luu, %s - (uint)%s);\n",
                    dim->y, kvarNames->sizeM, coordY,
                    dim->x, kvarNames->sizeN, coordX);
            kgenAddStmt(ctx, tmp);

            sprintf(tmp, "if ((y == %lu) && (x == %lu))",
                    dim->y, dim->x);
            kgenBeginBranch(ctx, tmp);

            // optimized update
            updateResultGen(ctx,
                gset,
                funcID,
                op,
                flags,
                &uvars);

            kgenEndBranch(ctx, NULL);

            kgenBeginBranch(ctx, "else ");

            // not optimized update
            flags |= UPRES_GENERIC;
            updateResultGen(ctx,
                gset,
                funcID,
                op,
                flags,
                &uvars);

            ret = kgenEndBranch(ctx, NULL);
        }
    }

    if (useCondition) {
        ret = kgenEndBranch(ctx, NULL);
    }

    return (ret) ? -EOVERFLOW : 0;
}
示例#30
0
static void
declareLocalVariables(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    Tile* parTile,
    TrsmExtraParams * extraParams)
{
    char tmp[1024];
    const SubproblemDim *dims = gset->subdims;
    const char* parTileTypeName = NULL;
    bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags,
                                   MATRIX_B);
    unsigned int locWidth;
    unsigned int tsize;
    unsigned int parTileSize;
    unsigned int l1Pans;
    unsigned int step;

    kgenAddStmt(ctx,
                 "const int lid = get_local_id(0);\n"
                 "const int gid = get_group_id(0);\n"
                 "GPtr uA, uB;\n"
                 "uint coordA, coordB;\n"
                 "uint m0 = 0, k0, m1;\n");

    if (isMatrixUpper(gset->kextra->flags)) {
        sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n",
                dims[0].y, dims[0].y);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Declare private blocks.
     * The region 'b' stores in different time tiles of both
     * the input matrices and the result
     */

    declareTileStorages(ctx, gset);

    *parTile = gset->tileBX;

    if (extraParams->ldsUse) {
        tsize = dtypeSize(gset->kextra->dtype);
        l1Pans = (unsigned int)(dims[0].x / dims[1].x);

        parTile->vecLen = (trb) ? (unsigned int)dims[1].x
                                : (unsigned int)dims[1].bwidth;
        parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize);
        parTile->trans = trb;

       /*
        * Allocate enough space in the local area to fit several tiles
        * at the stage1 (according to the unrolled factor) and one tile
        * at the stage2
        */

        locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor;
        if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
            locWidth = umax(locWidth, (unsigned int)dims[1].y);
        }
        if (trb) {
            parTile->nrRows = locWidth;
            parTile->nrCols = (unsigned int)dims[0].x;
            step = (unsigned int)dims[1].x / parTile->vecLen;
        }
        else {
            parTile->nrRows = (unsigned int)dims[0].x;
            parTile->nrCols = locWidth;
            step = (unsigned int)dims[1].x * locWidth / parTile->vecLen;
        }

        parTileSize = tileVectorsNum(parTile);

        getVectorTypeName(gset->kextra->dtype, parTile->vecLen,
                          &parTileTypeName, NULL);

        sprintf(tmp, "__local %s tmpB[%i];\n"
                     "LPtr lB;\n"
                     "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n",
                parTileTypeName, parTileSize, l1Pans, step);
        kgenAddStmt(ctx, tmp);

        if (useSkewedFetchB(gset)) {
            kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n",
                       l1Pans, gset->subdims[1].x);
        }
    }

    kgenAddBlankLine(ctx);
}