// Generate control block of the loop over K
static void
genInternalLoopCtl(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    KernelExtraFlags kflags,
    size_t stepK,
    size_t boundAlign)
{
    char tmp[1024];

    if (isMatrixUpper(kflags)) {
        if (kflags & KEXTRA_TAILS_M) {
            sprintf(tmp, "for (k0 = currM + %lu; k0 < M / %lu * %lu; "
                         "k0 += %lu)",
                    dim[0].y, boundAlign, boundAlign, stepK);
        }
        else {
            sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)",
                    dim[0].y, stepK);
        }
    }
    else {
        sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)", stepK);
    }

    kgenBeginBranch(ctx, tmp);
}
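/*
 * Editor's illustration (hypothetical values, not taken from this file):
 * with dim[0].y == 32, boundAlign == 64, stepK == 8 and KEXTRA_TAILS_M set,
 * the loop header emitted above would be
 *
 *     for (k0 = currM + 32; k0 < M / 64 * 64; k0 += 8)
 *
 * i.e. the walk over K starts right below the current block and stops at
 * the last boundAlign-aligned bound when M has a tail.
 */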
static void
genZeroTileTrash(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    MatrixRole mrole,
    Tile* tile)
{
    char tmp[1024];
    const SubproblemDim *dim = &gset->subdims[1];
    const CLBLASKernExtra *kextra = gset->kextra;
    unsigned int i, j;
    unsigned int step;
    Kstring elem;

    if (mrole == MATRIX_A) {
        kgenAddBlankLine(ctx);
    }
    else {
        kgenBeginBranch(ctx, NULL);
    }

    sprintf(tmp, "const int bound = (coordA + %lu > M) ? (M - coordA) : %lu;\n",
            dim->y, dim->y);
    kgenAddStmt(ctx, tmp);

    step = tileLineSegmentLen(tile);
    step = (tile->trans) ? 1 : step;

    for (j = 0; j < tile->nrRows; ++j) {
        for (i = 0; i < tile->nrCols; i += step) {
            sprintfTileElement(&elem, tile, j, i, step);
            sprintf(tmp, "%s = (bound <= %u) ? 0 : %s;\n",
                    elem.buf, j, elem.buf);
            kgenAddStmt(ctx, tmp);
        }
    }

    // Set ones on the trash diagonal elements for a tile of A
    if (mrole == MATRIX_A) {
        for (i = 0; i < (unsigned int)dim->y; i++) {
            sprintfTileElement(&elem, tile, i, i, 1);
            sprintf(tmp, "%s = (bound <= %d) ? %s : %s;\n",
                    elem.buf, (int)i, strOne(kextra->dtype), elem.buf);
            kgenAddStmt(ctx, tmp);
        }
    }

    if (mrole == MATRIX_A) {
        kgenAddBlankLine(ctx);
    }
    else {
        kgenEndBranch(ctx, NULL);
    }
}
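/*
 * Sketch of the emitted zeroing, assuming dim->y == 8 and scalar tile
 * elements that sprintfTileElement() names a0, a1, ... (illustrative names):
 *
 *     const int bound = (coordA + 8 > M) ? (M - coordA) : 8;
 *     a0 = (bound <= 0) ? 0 : a0;
 *     a1 = (bound <= 1) ? 0 : a1;
 *     ...
 *
 * and for MATRIX_A the trash diagonal elements are then set to one.
 */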
void
checkGenBeginHitMatrixBlock(
    struct KgenContext *ctx,
    KernelExtraFlags kflags)
{
    bool tailsM = (kflags & KEXTRA_TAILS_M) != 0;
    bool tailsN = (kflags & KEXTRA_TAILS_N) != 0;

    if (tailsM) {
        if (tailsN) {
            kgenBeginBranch(ctx, "if ((coord.x < N) && (coord.y < M))");
        }
        else {
            kgenBeginBranch(ctx, "if (coord.y < M)");
        }
    }
    else {
        if (tailsN) {
            kgenBeginBranch(ctx, "if (coord.x < N)");
        }
    }
}
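/*
 * The branch opened here is closed by the matching "end" helper; e.g. with
 * both tail flags set the emitted guard is
 * "if ((coord.x < N) && (coord.y < M))", and no guard is emitted at all
 * when the problem has no tails.
 */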
/*
 * Common function for generating loop tails
 */
static void
addTailCode(
    struct KgenContext *ctx,
    GenPriv *gpriv,
    LoopUnrollGen genSingleVec,
    LoopUnrollGen genSingle)
{
    char tmp[1024];
    const ItemWork *work = gpriv->work;
    LoopCtl loopCtl;
    LoopUnrollers unrollers;

    memset(&loopCtl, 0, sizeof(loopCtl));
    memset(&unrollers, 0, sizeof(unrollers));
    loopCtl.inBound = (unsigned long)work->tail;

    if (work->itemsPerRow > 1) {
        if (work->nrItems) {
            sprintf(tmp, "if ((%s %% %u == %u) && (%s < %u))",
                    lidVarName, work->itemsPerRow, work->itemsPerRow - 1,
                    lidVarName, work->nrItems);
        }
        else {
            sprintf(tmp, "if (%s %% %u == %u)",
                    lidVarName, work->itemsPerRow, work->itemsPerRow - 1);
        }
        kgenBeginBranch(ctx, tmp);
    }

    unrollers.genSingleVec = genSingleVec;
    unrollers.genSingle = genSingle;
    unrollers.getVecLen = getVecLen;
    kgenLoopUnroll(ctx, &loopCtl, gpriv->dtype, &unrollers, gpriv);

    if (work->itemsPerRow > 1) {
        kgenEndBranch(ctx, NULL);
    }
}
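/*
 * Hedged example: assuming lidVarName expands to "lid",
 * work->itemsPerRow == 4 and work->nrItems == 30, the guard around the
 * unrolled tail loop reads
 *
 *     if ((lid % 4 == 3) && (lid < 30))
 *
 * so only the last item of each row processes the tail elements.
 */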
/*
 * Generate cyclical tile shifting so as to convert the skewed
 * storing to "one-to-one", i.e. the first element in the tile
 * matches the first element of the respective tile in the
 * output matrix.
 */
static void
genTileCyclicalShift(struct KgenContext *ctx, BlasGenSettings *gset)
{
    const char *tname;
    Kstring k1, k2, *src, *dst, *ktmp;
    unsigned int row, col;
    unsigned int seglen;
    Tile *tileC = &gset->tileCY;

    seglen = tileLineSegmentLen(tileC);
    getVectorTypeName(gset->kextra->dtype, seglen, &tname, NULL);
    kgenAddStmt(ctx, "\n// get rid of skewing in the result\n");
    kgenBeginBranch(ctx, "for (uint i = 0; i < skewX; i++)");
    kgenPrintf(ctx, "%s tmp;\n\n", tname);

    src = &k1;
    dst = &k2;

    // Skewing may be used only in case of transposed C
    for (row = 0; row < tileC->nrRows; row += seglen) {
        sprintfTileElement(dst, tileC, row, tileC->nrCols - 1, seglen);
        kgenPrintf(ctx, "tmp = %s;\n", dst->buf);
        for (col = tileC->nrCols - 1; col > 0; col--) {
            sprintfTileElement(src, tileC, row, col - 1, seglen);
            kgenPrintf(ctx, "%s = %s;\n", dst->buf, src->buf);
            // swap the pointers
            ktmp = src;
            src = dst;
            dst = ktmp;
        }
        kgenPrintf(ctx, "%s = tmp;\n", dst->buf);
    }

    kgenEndBranch(ctx, NULL);
    kgenAddBlankLine(ctx);
}
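/*
 * Each iteration of the emitted "for (uint i = 0; i < skewX; i++)" loop
 * rotates every row of tileC right by one segment, so after skewX
 * iterations the skew is undone. Schematic output for one 3-element row
 * (element names are illustrative, the real ones come from
 * sprintfTileElement()):
 *
 *     tmp = c2;
 *     c2 = c1;
 *     c1 = c0;
 *     c0 = tmp;
 */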
int
genResultUpdateWithFlags(
    struct KgenContext *ctx,
    BlasFunctionID funcID,
    const BlasGenSettings *gset,
    UpdateResultFlags flags,
    const char *optFuncName,
    const char *genericFuncName,
    const char *cachedName)
{
    KernelExtraFlags kflags = gset->kextra->flags;
    UpdateResultOp op;
    char tmp[1024];
    int ret = 0;
    const char *coordY, *coordX;
    UpresVarNames uvars;
    const KernelVarNames *kvarNames = &gset->varNames;
    const SubproblemDim *dim = &gset->subdims[1];
    bool areTails, useCondition;

    memset(&uvars, 0, sizeof(uvars));

    coordX = kvarNames->coordB;
    coordY = kvarNames->coordA;

    if (funcHasTriangMatrix(funcID)) {
        if (flags & UPRES_TRIANG_WRITE_C) {
            uvars.result = "C";
        }
        else {
            uvars.result = "B";
        }
        uvars.ld = "ldb";
    }
    else {
        uvars.result = "C";
        uvars.ld = "ldc";
    }

    uvars.cachedName = cachedName;

    /* For now, kernels that do not use UPRES_EXCEED_PROBLEM_CONDITION
     * must return before updating the result in case the problem exceeds
     * the more precise lower level conditions (KEXTRA_TAILS_M_LOWER,
     * KEXTRA_TAILS_N_LOWER) */
    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) != 0;
    useCondition = areTails && ((flags & UPRES_EXCEED_PROBLEM_CONDITION) != 0);
    if (useCondition) {
        bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
        bool tailN = (kflags & KEXTRA_TAILS_N) != 0;

        if (tailM) {
            if (tailN) {
                sprintf(tmp, "if ((%s < %s) && (%s < %s))",
                        coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
            }
            else {
                sprintf(tmp, "if (%s < %s)", coordY, kvarNames->sizeM);
            }
        }
        else {
            // here tailN is true
            sprintf(tmp, "if (%s < %s)", coordX, kvarNames->sizeN);
        }
        kgenBeginBranch(ctx, tmp);
    }
    else {
        kgenAddBlankLine(ctx);
    }

    if (optFuncName) {
        const char *betaStr;

        betaStr = (flags & UPRES_WITH_BETA) ? ", beta" : "";

        // update by invoking the dedicated functions
        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
            sprintf(tmp, "%s(%s, c, alpha, %s, %s, %s%s);\n",
                    optFuncName, uvars.result, coordY, coordX,
                    uvars.ld, betaStr);
        }
        else {
            sprintf(tmp,
                    "uint y = min(%luu, %s - (uint)%s);\n"
                    "uint x = min(%luu, %s - (uint)%s);\n"
                    "if ((y == %lu) && (x == %lu)) {\n"
                    "    %s(%s, c, alpha, %s, %s, %s%s);\n"
                    "}\n"
                    "else {\n"
                    "    %s(%s, c, alpha, %s, %s, %s%s, y, x);\n"
                    "}\n",
                    dim->y, kvarNames->sizeM, coordY,
                    dim->x, kvarNames->sizeN, coordX,
                    dim->y, dim->x,
                    optFuncName, uvars.result, coordY, coordX,
                    uvars.ld, betaStr,
                    genericFuncName, uvars.result, coordY, coordX,
                    uvars.ld, betaStr);
        }
        kgenAddStmt(ctx, tmp);
    }
    else {
        // inline result update
        flags |= UPRES_INLINE;
        op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;

        uvars.startRow = coordY;
        uvars.startCol = coordX;
        uvars.nrRows = "y";
        uvars.nrCols = "x";

        if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) {
            ret = updateResultGen(ctx, gset, funcID, op, flags, &uvars);
        }
        else {
            sprintf(tmp,
                    "uint y = min(%luu, %s - (uint)%s);\n"
                    "uint x = min(%luu, %s - (uint)%s);\n",
                    dim->y, kvarNames->sizeM, coordY,
                    dim->x, kvarNames->sizeN, coordX);
            kgenAddStmt(ctx, tmp);

            sprintf(tmp, "if ((y == %lu) && (x == %lu))", dim->y, dim->x);
            kgenBeginBranch(ctx, tmp);
            // optimized update
            updateResultGen(ctx, gset, funcID, op, flags, &uvars);
            kgenEndBranch(ctx, NULL);

            kgenBeginBranch(ctx, "else ");
            // not optimized update
            flags |= UPRES_GENERIC;
            updateResultGen(ctx, gset, funcID, op, flags, &uvars);
            ret = kgenEndBranch(ctx, NULL);
        }
    }

    if (useCondition) {
        ret = kgenEndBranch(ctx, NULL);
    }

    return (ret) ? -EOVERFLOW : 0;
}
static void
genUpdateIntermResult(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    bool withMhitCond,
    UpdateResultFlags flags)
{
    char tmp[1024];
    const char *coordY, *coordX;
    char *revAlp, *alp;
    DataType dtype = gset->kextra->dtype;
    KernelExtraFlags kflags = gset->kextra->flags;
    const SubproblemDim *dim = &gset->subdims[1];
    const KernelVarNames *kvarNames = &gset->varNames;
    UpdateResultOp op;
    UpresVarNames uvars;
    const char *ctype;

    memset(&uvars, 0, sizeof(uvars));

    op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET;

    uvars.startRow = kvarNames->coordA;
    uvars.startCol = kvarNames->coordB;
    uvars.nrRows = "y";
    uvars.nrCols = "x";
    uvars.result = "B";
    uvars.ld = "ldb";

    ctype = dtypeBuiltinType(dtype);
    if (isComplexType(dtype)) {
        if (dtype == TYPE_COMPLEX_FLOAT) {
            revAlp = "div((float2)(-1.f, 0), alpha)";
            alp = "(float2)(1.f, 0)";
        }
        else {
            revAlp = "div((double2)(-1., 0), alpha)";
            alp = "(double2)(1., 0)";
        }
    }
    else {
        revAlp = "-1. / alpha";
        alp = "1.";
    }

    // inline result update
    flags |= UPRES_INLINE;

    coordY = kvarNames->coordA;
    coordX = kvarNames->coordB;

    /*
     * We should be careful here.
     *
     * The non-tailed case of updateResult() has been rewritten.
     * Now, result updating for the tailed and non-tailed cases has slightly
     * different semantics.
     *
     * The first one produces expressions like
     * 'dst = dst * beta + src * alpha'.
     *
     * Here, 'dst' and 'src' may be the private result stored in registers
     * or the result to be updated in the global memory. Let the first be
     * designated as tileC and the second one as matC.
     *
     * The non-tailed case produces expressions like
     * 'dst = matC * beta + tileC * alpha'.
     *
     * The second variant is clearer and more natural for the new
     * implementation. But as the difference has not been eliminated yet,
     * both variants are maintained here.
     */
    if (!(kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N))) {
        kgenBeginBranch(ctx, "");
        sprintf(tmp, "%s %s = %s;\n"
                     "%s alpha = beta;\n",
                ctype, "beta", revAlp, ctype);
        kgenAddStmt(ctx, tmp);
        updateResultGen(ctx, gset, CLBLAS_TRSM, op,
                        flags & ~UPRES_WITH_BETA, &uvars);
        kgenEndBranch(ctx, NULL);
    }
    else {
        if (withMhitCond) {
            sprintf(tmp, "if ((%s < %s) && (%s < %s))",
                    coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
            kgenBeginBranch(ctx, tmp);
        }
        else {
            /* for x, y variables scope */
            kgenBeginBranch(ctx, NULL);
        }

        sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
                     "uint x = min(%luu, %s - (uint)%s);\n",
                dim->y, kvarNames->sizeM, coordY,
                dim->x, kvarNames->sizeN, coordX);
        kgenAddStmt(ctx, tmp);

        sprintf(tmp, "if ((y == %lu) && (x == %lu))", dim->y, dim->x);
        kgenBeginBranch(ctx, tmp);
        sprintf(tmp, "%s %s = %s;\n"
                     "%s alpha = beta;\n",
                ctype, "beta", revAlp, ctype);
        kgenAddStmt(ctx, tmp);
        // optimized update
        updateResultGen(ctx, gset, CLBLAS_TRSM, op,
                        flags & ~UPRES_WITH_BETA, &uvars);
        kgenEndBranch(ctx, NULL);

        flags |= UPRES_GENERIC;
        kgenBeginBranch(ctx, "else ");
        sprintf(tmp, "%s %s = %s;\n"
                     "%s %s = %s;\n",
                ctype, "beta", revAlp, ctype, "alpha", alp);
        kgenAddStmt(ctx, tmp);
        // not optimized update
        updateResultGen(ctx, gset, CLBLAS_TRSM, op, flags, &uvars);
        kgenEndBranch(ctx, NULL);

        kgenEndBranch(ctx, NULL);
    }
}
static int
genLoopsK(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    char *tmp)
{
    KernelExtraFlags kflags = gset->kextra->flags;
    const size_t y0 = gset->subdims[0].y;
    const size_t bwidth = gset->subdims[1].bwidth;
    int ret;
    bool isRel = false;
    const char *inTypeNameA, *inPtrNameA, *inTypeNameB, *inPtrNameB;

    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenA,
                      &inTypeNameA, &inPtrNameA);
    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenB,
                      &inTypeNameB, &inPtrNameB);

    sprintf(tmp, "uint k0;\n");
    kgenAddStmt(ctx, tmp);

    if (!(kflags & (KEXTRA_TAILS_M_LOWER |
                    KEXTRA_TAILS_N_LOWER |
                    KEXTRA_TAILS_K_LOWER))) {
        FetchAddrMode addrMode = FETCH_ADDR_A_RELATIVE |
                                 FETCH_ADDR_B_RELATIVE |
                                 FETCH_ADDR_K_RELATIVE;

        isRel = true;
        mulOpts->fctx = createFetchContext();
        if (mulOpts->fctx == NULL) {
            return -ENOMEM;
        }
        setFetchAddrMode(mulOpts->fctx, addrMode);
        gset->varNames.A = "pA";
        gset->varNames.B = "pB";
    }
    else {
        gset->flags |= BGF_UPTRS;
        kgenPrintf(ctx, "GPtr Ag, Bg;\n"
                        "\n"
                        "Ag.%s = A;\n"
                        "Bg.%s = B;\n\n",
                   inPtrNameA, inPtrNameB);
    }

    if (isMatrixUpper(kflags)) {
        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0) << 1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            }
        }

        sprintf(tmp, "for (k0 = kBegin; "
                     "(k0 <= (kBegin + %luu))&&(k0 < M); "
                     "k0 += %lu)",
                y0, bwidth);
        kgenBeginBranch(ctx, tmp);
        kgenPrintf(ctx, "coord.z = k0;\n");
        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // main triangle part
        sprintf(tmp, "for (; k0 <= max(0, (int)M - %lu); k0 += %lu)",
                y0, gset->subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);
        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // matrix side part;
        // should be calculated by item0 of each subgroup
        sprintf(tmp, "for (; k0 < M; k0 += %lu)", bwidth);
        kgenBeginBranch(ctx, tmp);
        kgenPrintf(ctx, "coord.z = k0;\n");
        resetFetchNumA(mulOpts);
        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }
    else {      // lower
        size_t diagBlocks; // number of bw*y blocks that fit in the y*y square

        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0) << 1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            }
        }

        diagBlocks = divRoundUp(y0, bwidth);
        sprintf(tmp, "uint iterK = min(currM + %luu, M);\n", y0);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "iterK = (iterK + %lu) / %lu;\n", bwidth - 1, bwidth);
        kgenAddStmt(ctx, tmp);

        // main triangle part
        sprintf(tmp, "for (k0 = 0; k0 < max(0, (int)iterK - %lu); k0++)",
                diagBlocks);
        kgenBeginBranch(ctx, tmp);
        // part without diagonal elements post fetch zeroing
        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // diagonal part
        sprintf(tmp, "for (; k0 < iterK; k0++)");
        kgenBeginBranch(ctx, tmp);
        kgenPrintf(ctx, "coord.z = k0 * %lu;\n", bwidth);
        // diagonal blocks part
        mulOpts->postFetch = genTrxmPostFetchZero;
        resetFetchNumA(mulOpts);
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (isRel) {
        destroyFetchContext(mulOpts->fctx);
        mulOpts->fctx = NULL;
    }

    return 0;
}
// generator working with subproblems of any dimension
static int
copyDBlockGenericGen(
    struct KgenContext *ctx,
    const PGranularity *pgran,
    GenPriv *gpriv)
{
    char fpref;
    const char varPref[2] = {'G', 'L'};
    char tmp[1024];
    bool image;
    const char *s[3];
    int gdir;
    unsigned int i, n, gsize;
    const char *vfield;
    DataType dtype = gpriv->dtype;

    fpref = dtypeToPrefix(dtype);
    if (!fpref || (fpref == 'i')) {
        return -EINVAL;
    }

    image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE ||
             gpriv->dir == DBLOCK_LOCAL_TO_IMAGE);

    s[0] = (gpriv->transp) ? "Transp" : "";
    vfield = dtypeUPtrField(dtype);
    n = FLOAT4_VECLEN / gpriv->nfloats;
    gsize = pgran->wgSize[0] * pgran->wgSize[1];

    if (image) {
        char srcStr[1024];

        s[1] = (gpriv->packed) ? "Pack" : "";
        if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) {
            sprintf(srcStr, "src.%s += (startRow + lid * n) *"
                            " srcLD + startCol;\n", vfield);
            sprintf(tmp, copyMemGImgDBlockSlowDecl, fpref, s[1]);
        }
        else {
            sprintf(srcStr, "src.%s += srcLD * lid * n;\n", vfield);
            sprintf(tmp, copyMemLImgDBlockSlowDecl, fpref, s[1]);
        }
        kgenDeclareFunction(ctx, tmp);
        kgenBeginFuncBody(ctx);
        sprintf(tmp, "int x, y;\n"
                     "uint i, j, n, jb, jv;\n"
                     "int lsize = %u;\n", gsize);
        kgenAddStmt(ctx, tmp);
        kgenDeclareLocalID(ctx, "lid", pgran);

        if (gpriv->packed) {
            char nLinesStr[1024];

            sprintf(nLinesStr,
                    "nLines = (get_image_width(dst) - startX) * %d / nrCols;\n"
                    "index = lid * n;\n",
                    FLOAT4_VECLEN / gpriv->nfloats);
            sprintf(tmp, "int nLines, index;\n");
            kgenAddStmt(ctx, tmp);
            sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n, "",
                    nLinesStr, srcStr);
        }
        else {
            sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n, "",
                    "x = startX;\n"
                    "y = startY + lid * n;\n",
                    srcStr);
        }
        kgenAddStmt(ctx, tmp);

        gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) ? 0 : 1;
        if (gpriv->packed) {
            sprintf(tmp, copyMemImgDBlockPackedSlow, varPref[gdir],
                    FLOAT4_VECLEN / gpriv->nfloats, vfield);
        }
        else {
            sprintf(tmp, copyMemImgDBlockSlow, varPref[gdir], vfield);
        }
        kgenAddStmt(ctx, tmp);
    }
    else {
        LoopCtl loopCtl;
        LoopUnrollers unrollers;
        char buf[3][256];

        memset(&loopCtl, 0, sizeof(loopCtl));
        memset(&unrollers, 0, sizeof(unrollers));

        s[1] = (gpriv->conjugate) ? "Conj" : "";
        s[2] = (gpriv->notVectorize) ? "Nvec" : "";
        gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1;
        sprintf(tmp, copyMemDBlockSlowDecl, fpref, s[0], s[1], s[2],
                varPref[gdir], varPref[1 - gdir], varPref[1 - gdir],
                varPref[gdir]);
        kgenDeclareFunction(ctx, tmp);
        kgenBeginFuncBody(ctx);
        kgenDeclareLocalID(ctx, "lid", pgran);
        sprintf(tmp, "int lsize = %u;\n", gsize);
        kgenAddStmt(ctx, tmp);

        if (dtype == TYPE_COMPLEX_DOUBLE) {
            s[0] = "";
            s[1] = "";
        }
        else {
            s[0] = "uint js;\n";
            s[1] = (gpriv->transp || gpriv->conjugate) ? "float4 tmp;\n" : "";
        }

        // pass over rows or columns?
        i = (gpriv->transp && gdir) ? 1 : 0;

        if (dtype == TYPE_COMPLEX_DOUBLE) {
            buf[0][0] = '\0';
        }
        else {
            const char *boundName;

            // set the counter bound to copy the tail part,
            // each work being less than a float4
            boundName = (i) ? "nrRows" : "nrCols";

            /*
             * FIXME: this kludge was introduced due to a strange runtime
             * segfault at block transferring for other data types. Verify
             * it later. For now, keep only the simple loop for non-float
             * types.
             */
            if (i && (dtype != TYPE_FLOAT)) {
                gpriv->notVectorize = true;
            }

            if (gpriv->notVectorize) {
                sprintf(buf[0], "jb = 0;\n"
                                "jv = 0;\n"
                                "js = %s;\n", boundName);
            }
            else {
                sprintf(buf[0], "js = %s - jb * %u - jv * %u;\n",
                        boundName, 4 * n, n);
            }
        }

        // set initial pointers
        if (!gdir) {
            sprintf(buf[1], "src.%s += (startRow + lid * n) * srcLD + "
                            "startCol;\n", vfield);
            if (gpriv->transp) {
                sprintf(buf[2], "dst.%s += lid * n;\n", vfield);
            }
            else {
                sprintf(buf[2], "dst.%s += dstLD * lid * n;\n", vfield);
            }
        }
        else {
            if (gpriv->transp) {
                sprintf(buf[1], "src.%s += lid * n;\n", vfield);
            }
            else {
                sprintf(buf[1], "src.%s += srcLD * lid * n;\n", vfield);
            }
            sprintf(buf[2], "dst.%s += (startRow + lid * n) * dstLD + "
                            "startCol;\n", vfield);
        }

        sprintf(tmp, copyMemSlowLvars, s[0], s[1], varPref[1 - gdir],
                varPref[gdir]);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, copyMemDBlockSlowStart[i], 4 * n, 4 * n, n, buf[0],
                buf[1], buf[2]);
        kgenAddStmt(ctx, tmp);

        // prepare to loop unrolling
        gpriv->srcName = "src1";
        gpriv->dstName = "dst1";
        if (gdir) {
            gpriv->locLDName = "srcLD";
            gpriv->globLDName = "dstLD";
        }
        else {
            gpriv->locLDName = "dstLD";
            gpriv->globLDName = "srcLD";
        }
        loopCtl.ocName = "j";
        if (gpriv->transp) {
            unrollers.genSingle = copyMemSingleTransp;
            if (dtype != TYPE_COMPLEX_DOUBLE) {
                unrollers.genSingleVec = copyMemVecTransp;
            }
        }
        else {
            unrollers.genSingle = copyMemSingle;
            if (dtype != TYPE_COMPLEX_DOUBLE) {
                unrollers.genSingleVec = copyMemVec;
            }
        }

        // external loop
        kgenBeginBranch(ctx, "for (i = 0; i < n; i++)");
        copyMemPreUnroll(ctx, gpriv);

        // finally, unroll all loops
        unrollers.getVecLen = getVecLen;

        // copying with 4 float4 words
        if (!gpriv->notVectorize) {
            loopCtl.outBound.name = "jb";
            loopCtl.inBound = 4 * n;
            kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);

            // copying with float4 words
            loopCtl.outBound.name = "jv";
            loopCtl.inBound = n;
            kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
        }

        // copying the remaining tail
        if (dtype != TYPE_COMPLEX_DOUBLE) {
            unrollers.genSingleVec = NULL;
            loopCtl.outBound.name = "js";
            loopCtl.inBound = 1;
            kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv);
        }

        copyMemPostUnroll(ctx, gpriv);
        kgenEndBranch(ctx, NULL);
    }

    return kgenEndFuncBody(ctx);
}
void
genUpdateIntermTrsmResult(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    const char *optFuncName,
    const char *genericFuncName,
    bool withMhitCond)
{
    char tmp[1024];
    const char *coordY, *coordX;
    char *revAlp, *alp;
    DataType dtype = gset->kextra->dtype;
    KernelExtraFlags kflags = gset->kextra->flags;
    const SubproblemDim *dim = &gset->subdims[1];
    const KernelVarNames *kvarNames = &gset->varNames;

    if (isComplexType(dtype)) {
        if (dtype == TYPE_COMPLEX_FLOAT) {
            revAlp = "div((float2)(-1.f, 0), alpha)";
            alp = "(float2)(1.f, 0)";
        }
        else {
            revAlp = "div((double2)(-1., 0), alpha)";
            alp = "(double2)(1., 0)";
        }
    }
    else {
        revAlp = "-1. / alpha";
        alp = "1.";
    }

    coordY = kvarNames->coordA;
    coordX = kvarNames->coordB;

    if (!(kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N))) {
        sprintf(tmp, "%s(B, c, %s, %s, %s, ldb, %s);\n",
                optFuncName, alp, coordY, coordX, revAlp);
        kgenAddStmt(ctx, tmp);
    }
    else {
        if (withMhitCond) {
            sprintf(tmp, "if ((%s < %s) && (%s < %s))",
                    coordY, kvarNames->sizeM, coordX, kvarNames->sizeN);
            kgenBeginBranch(ctx, tmp);
        }
        else {
            /* for x, y variables scope */
            kgenBeginBranch(ctx, NULL);
        }

        sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n"
                     "uint x = min(%luu, %s - (uint)%s);\n"
                     "if ((y == %luu) && (x == %luu)) {\n"
                     "    %s(B, c, %s, %s, %s, ldb, %s);\n"
                     "}\n"
                     "else {\n"
                     "    %s(B, c, %s, %s, %s, ldb, %s, y, x);\n"
                     "}\n",
                dim->y, kvarNames->sizeM, coordY,
                dim->x, kvarNames->sizeN, coordX,
                dim->y, dim->x,
                optFuncName, alp, coordY, coordX, revAlp,
                genericFuncName, alp, coordY, coordX, revAlp);
        kgenAddStmt(ctx, tmp);

        kgenEndBranch(ctx, NULL);
    }
}
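/*
 * Sketch of the emitted tail dispatch, assuming dim->y == dim->x == 8, real
 * float data, and "opt"/"gen" standing for optFuncName/genericFuncName:
 *
 *     uint y = min(8u, M - (uint)coordA);
 *     uint x = min(8u, N - (uint)coordB);
 *     if ((y == 8u) && (x == 8u)) {
 *         opt(B, c, 1., coordA, coordB, ldb, -1. / alpha);
 *     }
 *     else {
 *         gen(B, c, 1., coordA, coordB, ldb, -1. / alpha, y, x);
 *     }
 */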
void
genInvertingBlockFunc(
    struct KgenContext *ctx,
    size_t pitch,
    DataType dtype,
    KernelExtraFlags kflags)
{
    char tmp[1024];
    const char *ctype;

    ctype = dtypeBuiltinType(dtype);
    sprintf(tmp, "void\n"
                 "invert(__local %s *src, __local %s *dst, int lid, "
                 "int lastRow)\n", ctype, ctype);
    kgenDeclareFunction(ctx, tmp);
    kgenBeginFuncBody(ctx);

    kgenAddStmt(ctx, "int i, k;\n");
    if (isComplexType(dtype)) {
        sprintf(tmp, "dst[lid * %lu + lid].x = 1.f;\n", pitch);
    }
    else {
        sprintf(tmp, "dst[lid * %lu + lid] = 1.f;\n", pitch);
    }
    kgenAddStmt(ctx, tmp);

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "for (i = lastRow - 1; i >= 0; i--)");
    }
    else {
        sprintf(tmp, "for (i = 0; i < lastRow; i++)");
    }
    kgenBeginBranch(ctx, tmp);

    if (isComplexType(dtype)) {
        sprintf(tmp, "dst[i * %lu + lid] = div(dst[i * %lu + lid], "
                     "src[i * %lu + i]);\n", pitch, pitch, pitch);
    }
    else {
        sprintf(tmp, "dst[i * %lu + lid] = dst[i * %lu + lid] / "
                     "src[i * %lu + i];\n", pitch, pitch, pitch);
    }
    kgenAddStmt(ctx, tmp);

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "for (k = 0; k < i; k++)");
    }
    else {
        sprintf(tmp, "for (k = i + 1; k < %lu; k++)", pitch);
    }
    kgenBeginBranch(ctx, tmp);

    if (isComplexType(dtype)) {
        sprintf(tmp, "dst[k * %lu + lid] = dst[k * %lu + lid] - "
                     "mul(src[k * %lu + i], dst[i * %lu + lid]);\n",
                pitch, pitch, pitch, pitch);
    }
    else {
        sprintf(tmp, "dst[k * %lu + lid] = dst[k * %lu + lid] - "
                     "dst[i * %lu + lid] * src[k * %lu + i];\n",
                pitch, pitch, pitch, pitch);
    }
    kgenAddStmt(ctx, tmp);

    kgenEndBranch(ctx, NULL);
    kgenEndBranch(ctx, NULL);
    kgenEndFuncBody(ctx);
}
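/*
 * Illustrative output for float data, pitch == 16 and a lower triangular
 * matrix; each work-item (lid) computes one column of the inverse by
 * forward substitution (backward for the upper triangular case):
 *
 *     void
 *     invert(__local float *src, __local float *dst, int lid, int lastRow)
 *     {
 *         int i, k;
 *         dst[lid * 16 + lid] = 1.f;
 *         for (i = 0; i < lastRow; i++) {
 *             dst[i * 16 + lid] = dst[i * 16 + lid] / src[i * 16 + i];
 *             for (k = i + 1; k < 16; k++) {
 *                 dst[k * 16 + lid] = dst[k * 16 + lid] -
 *                                     dst[i * 16 + lid] * src[k * 16 + i];
 *             }
 *         }
 *     }
 */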
// global memory based kernel generator
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^
                 ((kflags & KEXTRA_COLUMN_MAJOR) != 0);
    char tmp[2048];
    const char *typeName;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset, tgset, lset, gset1;
    CLBLASKernExtra kextraTmp;
    TileMulOpts mulOpts, tmulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    size_t vecLen = kextra->vecLen;
    const char *outTypeName;
    bool b;
    TilePostFetchPrivate pfPriv;
    struct symvPrivate priv;
    size_t wgSize;
    bool tailM = (kflags & KEXTRA_TAILS_M) != 0;
    bool tailK = (kflags & KEXTRA_TAILS_K) != 0;
    bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0;
    bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A);
    bool isComplex = isComplexType(dtype);
    Tile tileb;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    bool isHoriz = subdims[1].bwidth >= subdims[1].y;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth;
    unsigned int cLocal;
    unsigned int nPlans;

    wgSize = (subdims[0].y / subdims[1].y) *
             (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));
    memset(&pfPriv, 0, sizeof(pfPriv));
    memset(&priv, 0, sizeof(priv));

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);

    declareSymvKernel(ctx, dtype, pgran, kflags);
    ret = kgenBeginFuncBody(ctx);

    /* 1D work space. The matrix is divided among the work-items, each
     * calculates its own part of the vector y */

    kgenAddStmt(ctx, "#define M actualN\n");

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.flags |= BGF_WHOLE_A | BGF_UPTRS;
    gset.kextra = kextra;
    gset.pgran = pgran;

    initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET);
    gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols
                                     : gset.tileA.nrRows);
    if (isComplex) {
        gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);

    cLocal = wgSize / bStep;
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
            outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);

    sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n",
            gid, cLocal, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint n = coordA;\n");
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n",
            lid, bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddStmt(ctx, "actualN += startN;\n");
    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx, "if (coordA < actualN && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n"
                 "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);
    kgenAddStmt(ctx, "uint k = k0;\n");

    if (tailK) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, "if (n + Ktail < N)");
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        vnames->sizeM = "N";
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = "";    // should not be used for a vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "N";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
    if (rowMaj) {
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over column from the beginning of the "
                         "column till the diagonal\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over row from the beginning of the row "
                         "till the diagonal\n");
    }
    sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)",
            subdims[1].bwidth, subdims[1].bwidth,
            bStep * subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);
    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
              mulOpts.flags, kflags);

    upper ^= rowMaj;
    tra ^= rowMaj;
    if (upper ^ rowMaj && tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    gset.tileA.trans ^= !upper;

    tgset = gset;
    tmulOpts = mulOpts;

    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL);    /* k loop */

    if (tailK) {
        kextraTmp = *kextra;
        gset1 = gset;
        kextraTmp.vecLen = 1;
        gset1.kextra = &kextraTmp;
        gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1;
        gset1.tileBX.nrRows = 1;
        gset1.tileA.nrCols = 1;
        kextraTmp.vecLenA = 1;
    }

    if (isHoriz) {
        lset = gset;
        lset.subdims[0].bwidth = lset.subdims[1].bwidth =
            lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y);
        lset.tileA.nrCols = lset.tileA.nrRows = lset.tileBX.nrRows =
            lset.subdims[1].y;

        kgenAddStmt(ctx, "// the diagonal\n");
        kgenBeginBranch(ctx, "if (k <= n)");
        kgenAddStmt(ctx, "uint k1 = k;\n");

        if (subdims[1].bwidth != subdims[1].y) {
            kgenAddStmt(ctx, "// the pred diagonal\n");
            sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);
            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype,
                      vnames, mulOpts.flags, kflags);
            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL);    /* k loop */
        }

        initTile(&tileb, "b", lset.subdims[1].bwidth,
                 lset.subdims[1].bwidth, lset.subdims[1].bwidth,
                 lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET,
                 lset.tileA.trans, lset.tileA.packed);
        declareOneTileStorage(ctx, &tileb);

        genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames,
                  mulOpts.flags, kflags);
        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.tilea = lset.tileA;
        priv.diag = false;
        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        lset.tileA = tileb;
        mulOpts.postFetch = genPostFetchMirror;
        mulOpts.postFetchPriv = &priv;
        ret = tileMulGen(ctx, &lset, &mulOpts);
        if (ret != 0) {
            return ret;
        }

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans = lset.tileA.trans ^= true;
        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        if (subdims[1].bwidth != subdims[1].y) {
            size_t width = umax(subdims[1].bwidth, subdims[1].y);

            kgenAddStmt(ctx, "// the post diagonal\n");
            if (tailK) {
                kgenBeginBranch(ctx, "if(k < N)");
            }
            sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)",
                    lset.subdims[1].bwidth, width, width, width,
                    lset.subdims[1].bwidth);
            kgenBeginBranch(ctx, tmp);
            genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype,
                      vnames, mulOpts.flags, kflags);
            ret = tileMulGen(ctx, &lset, &mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL);    /* k loop */

            if (tailK) {
                kgenEndBranch(ctx, NULL);
                kgenBeginBranch(ctx, "else");
                /* Handle tail along vector X */
                kgenAddStmt(ctx, "N += Ntail;\n");
                mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
                sprintf(tmp, "for (k += %lu; k < actualN; k++)",
                        lset.subdims[1].bwidth);
                kgenBeginBranch(ctx, tmp);
                gset1.tileA.trans = gset.tileA.trans;
                genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype,
                          vnames, mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset1, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
                kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */
#else
                mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
                genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype,
                          vnames, mulOpts.flags, kflags);
                ret = tileMulGen(ctx, &gset, &mulOpts);
                if (ret != 0) {
                    return ret;
                }
#endif
                mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A;
                kgenEndBranch(ctx, NULL);
            }
        }

        sprintf(tmp, "k = k1 + %lu;\n", bStep * subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);
    }
    else {
        kgenAddStmt(ctx, "// the diagonal\n");
        sprintf(tmp, "if (k <= (n + (get_local_id(0)%%%lu)*%lu))",
                subdims[1].y / subdims[1].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);
        genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames,
                  mulOpts.flags, kflags);
        kgenBeginBranch(ctx, NULL);
        priv.mulOpts = &mulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;
        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &gset;
        mulOpts.postFetch = genPostFetchVertDiag;
        mulOpts.postFetchPriv = &priv;
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        if (upper ^ rowMaj && tra) {
            mulOpts.flags &= ~TILEMUL_TRA;
        }
        else {
            mulOpts.flags |= TILEMUL_TRA;
        }
        gset.tileA.trans ^= true;
        lset = gset;

        sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
                subdims[1].y / subdims[1].bwidth, subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenBeginBranch(ctx, NULL);
        priv.diag = false;
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
        mulOpts.postFetch = NULL;
        mulOpts.postFetchPriv = NULL;

        sprintf(tmp, "k += %lu;\n", bStep * subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);    /* if */
    }

    if (upper) {
        kgenAddStmt(ctx, "// k loop over row from the diagonal till "
                         "the right\n");
    }
    else {
        kgenAddStmt(ctx, "// k loop over column from the diagonal till "
                         "the bottom\n");
    }
    sprintf(tmp, "for (; k < N; k += %lu)", bStep * subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);
    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
              mulOpts.flags, kflags);
    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    kgenEndBranch(ctx, NULL);    /* k loop */

    if (tailK) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
#if 1
        sprintf(tmp, "for (; k < N; k++)");
        kgenBeginBranch(ctx, tmp);
        gset1.tileA.trans = gset.tileA.trans;
        genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset1, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);    /* k loop for tails along vector X */
#else
        mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
#endif
        kgenEndBranch(ctx, NULL);
        kgenBeginBranch(ctx, "else");

        sprintf(tmp, "for (; k < N; k += %lu)", bStep * subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);
        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A;
        genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames,
                  tmulOpts.flags, kflags);
        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        pfPriv.gset = &tgset;
        priv.diag = false;
        pfPriv.funcID = CLBLAS_SYMV;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;
        ret = tileMulGen(ctx, &tgset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }
        if (isHoriz) {
            sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth);
        }
        else {
            sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) "
                         "break;\n",
                    subdims[1].y, subdims[1].y / subdims[1].bwidth,
                    subdims[1].bwidth);
        }
        kgenAddStmt(ctx, tmp);
        kgenEndBranch(ctx, NULL);    /* k loop */

        kgenBeginBranch(ctx, "if (k < N)");
        if (isHoriz) {
            kgenAddStmt(ctx, "k = n;\n");
        }
        else {
            sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n",
                    subdims[1].y / subdims[1].bwidth, subdims[1].bwidth);
            kgenAddStmt(ctx, tmp);
        }
        genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames,
                  tmulOpts.flags, kflags);
        priv.mulOpts = &tmulOpts;
        priv.pfPriv = &pfPriv;
        priv.diag = true;
        pfPriv.funcID = CLBLAS_SYMV;
        pfPriv.gset = &lset;
        tmulOpts.postFetch = genPostFetchDiag;
        tmulOpts.postFetchPriv = &priv;
        if (!isHoriz) {
            if (upper ^ rowMaj && tra) {
                tmulOpts.flags &= ~TILEMUL_TRA;
            }
            else {
                tmulOpts.flags |= TILEMUL_TRA;
            }
            kgenAddStmt(ctx, "Ktail = N - n;\n");
            priv.coord = true;
        }
        else {
            priv.coord = false;
        }
        tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A |
                          TILEMUL_GLOBAL_CYCLIC_K;
        ret = tileMulGen(ctx, &lset, &tmulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx, NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)",
            lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    sprintf(tmp, "(%s - startN)", vnames->coordA);
    setResultPos(ctx, kflags, tmp);
    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);
    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
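/*
 * Decomposition note (informal): with bStep = subdims[0].bwidth /
 * subdims[1].bwidth and cLocal = wgSize / bStep, the prologue emitted above
 * amounts to
 *
 *     uint coordA = (gid * cLocal + lid / bStep) * dy + startN;
 *     uint k0 = (lid % bStep) * bw1;
 *
 * where dy = subdims[1].y and bw1 = subdims[1].bwidth. Each row band is
 * thus shared by bStep work-items marching along K; their partial sums meet
 * in localRes[][] and are reduced by the items with lid % bStep == 0.
 */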
// global memory based kernel generator
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    // yes, KEXTRA_TAILS_K, because it is set if N % bw != 0
    bool tailN = ((kflags & KEXTRA_TAILS_K) != 0);
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    char tmp[4096];
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    KernelVarNames *vnames = &gset.varNames;
    ssize_t ret;
    TilePostFetchPrivate pfPriv;
    unsigned int vecLen = kextra->vecLen;
    const char *outTypeName;
    const char *gid = "get_group_id(0)";
    const char *lid = "get_local_id(0)";
    const char *typeName;
    size_t wgSize;
    //unsigned int nStep = 32;
    unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8;
    unsigned int cLocal;
    bool isComplex = isComplexType(dtype);
    unsigned int nPlans;

    typeName = dtypeBuiltinType(dtype);
    memset(&gset, 0, sizeof(gset));
    memset(&mulOpts, 0, sizeof(mulOpts));

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // now, generate the kernel
    declareGemvKernel(ctx, dtype, pgran, kflags);
    ret = kgenBeginFuncBody(ctx);
    kgenAddStmt(ctx, "// M always denotes length of Y "
                     "and N denotes length of X in the kernel\n");

    /* 1D work space. The matrix is divided among the work-items, each
     * calculates its own part of the vector y */
    wgSize = (subdims[0].y / subdims[1].y) *
             (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    cLocal = wgSize / bStep;

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.pgran = pgran;
    gset.kextra = kextra;
    gset.flags = BGF_UPTRS;

    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
    if (isComplex) {
        gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);
    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);

    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
            outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);

    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
            gid, bStep, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);
    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
            lid, bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx, "if (coordA < M && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n"
                 "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);

    if (tailN) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = "";    // should not be used for a vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "M";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB |
                     TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    sprintf(tmp, "uint k = k0;\n"
                 "for (; k < N; k += %lu)", cLocal * subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    if (staggered) {
        vnames->k = "k1";
        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",
                staggered);
        kgenAddStmt(ctx, tmp);
    }

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
              mulOpts.flags, kflags);
    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    vnames->k = "k";
    kgenEndBranch(ctx, NULL);    /* k loop */

    if (tailN) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        kgenBeginBranch(ctx, "if (k < N)");

        mulOpts.flags |= TILEMUL_SKEW_B;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K | TILEMUL_WRAP_AROUND_TAIL;
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = subdims[1].bwidth;
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }

    kgenEndBranch(ctx, NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    setResultPos(ctx, kflags, vnames->coordA);
    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);
    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
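/*
 * Decomposition note (informal): here the emitted prologue amounts to
 *
 *     uint coordA = (gid * bStep + lid % bStep) * dy;
 *     uint k0 = (lid / bStep) * bw1;
 *
 * with dy = subdims[1].y and bw1 = subdims[1].bwidth: bStep neighboring
 * work-items cover consecutive row bands, while the cLocal = wgSize / bStep
 * items sharing a band walk it along K with stride cLocal * bw1. The
 * partial sums are reduced through localRes[][] by the items with
 * lid < bStep.
 */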
void
genTest(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    bool separateFetch)
{
    char s[1024];
    Kstring kstr;
    char *tName, tVect[64], *ptrName;
    KernelVarNames *vnames = &gset->varNames;
    DataType dtype = gset->kextra->dtype;
    const SubproblemDim *subdims = gset->subdims;
    unsigned int vecLen = gset->kextra->vecLen;
    size_t m, n, k;
    unsigned int i, j;
    bool tra, trb, localA, localB, vecCoords;
    int ret;
    TileMulFlags flags = mulOpts->flags;
    FetchOpts fetchOpts;

    m = gset->subdims[1].y;
    n = gset->subdims[1].x;
    k = gset->subdims[1].bwidth;
    tra = ((flags & TILEMUL_TRA) != 0);
    trb = ((flags & TILEMUL_TRB) != 0);
    localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
    localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
    vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0);

    tVect[0] = '\0';
    if (vecCoords && vecLen != 1) {
        sprintf(tVect, "%u", vecLen);
    }

    switch (dtype) {
    case TYPE_FLOAT:
        tName = "float";
        ptrName = "f";
        break;
    case TYPE_DOUBLE:
        tName = "double";
        ptrName = "d";
        break;
    case TYPE_COMPLEX_FLOAT:
        tName = "float2";
        ptrName = "f2v";
        break;
    case TYPE_COMPLEX_DOUBLE:
        tName = "double2";
        ptrName = "d2v";
        break;
    default:
        return;
    }

    if (vecCoords) {
        // do not use GPtrs in fetching
        vnames->A = "A";
        vnames->B = "B";
    }
    else {
        vnames->A = localA ? "LAptr" : "((GPtr)A)";
        vnames->B = localB ? "LBptr" : "((GPtr)B)";
    }
    if (!localA) {
        vnames->lda = "lda";
    }
    if (!localB) {
        vnames->ldb = "ldb";
    }
    vnames->sizeM = "M";
    vnames->sizeN = "N";
    vnames->sizeK = "K";
    vnames->skewA = "skewA";
    vnames->skewB = "skewB";
    vnames->skewK = "skewK";
    vnames->coordA = "workItemM";
    vnames->coordB = "workItemN";
    vnames->k = "k";

    kgenAddBlankLine(ctx);
    sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n",
            ITEM_WORK_M, ITEM_WORK_N);
    kgenAddStmt(ctx, s);
    kgenAddStmt(ctx, "__kernel void\n");
    sprintf(s, "%s(\n", kernelName);
    kgenAddStmt(ctx, s);
    sprintf(s, "    %s alpha,\n", tName);
    kgenAddStmt(ctx, s);
    sprintf(s, "    __global %s%s *A,\n", tName, tVect);
    kgenAddStmt(ctx, s);
    sprintf(s, "    __global %s%s *B,\n", tName, tVect);
    kgenAddStmt(ctx, s);
    kgenAddStmt(ctx, "    uint M,\n"
                     "    uint N,\n"
                     "    uint K,\n");
    sprintf(s, "    __global %s *C,\n"
               "    const uint iter)\n", tName);
    kgenAddStmt(ctx, s);
    kgenBeginFuncBody(ctx);

    sprintf(s, "uint workItemM = %lu * get_global_id(0);\n"
               "uint workItemN = %lu * get_global_id(1);\n", m, n);
    kgenAddStmt(ctx, s);
    if ((flags & TILEMUL_SKEW_A) != 0) {
        kgenAddStmt(ctx, "uint skewA = 0u;\n");
    }
    if ((flags & TILEMUL_SKEW_B) != 0) {
        kgenAddStmt(ctx, "uint skewB = 0u;\n");
    }
    if ((flags & TILEMUL_SKEW_K) != 0) {
        kgenAddStmt(ctx, "uint skewK = 0u;\n");
    }

    if (localA) {
        sprintf(s, "__local %s LA[%lu];\n", tName,
                subdims[0].bwidth * subdims[0].y);
        kgenAddStmt(ctx, s);
    }
    else {
        // global A
        sprintf(s, "uint lda = %s;\n", tra ? "M" : "K");
        kgenAddStmt(ctx, s);
    }
    if (localB) {
        sprintf(s, "__local %s LB[%lu];\n", tName,
                subdims[0].bwidth * subdims[0].x);
        kgenAddStmt(ctx, s);
    }
    else {
        // global B
        sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N");
        kgenAddStmt(ctx, s);
    }

    initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY);
    declareTileStorages(ctx, gset);

    if (vecCoords) {
        size_t ha, hb;
        char *str;

        ha = tra ? k : m;
        hb = trb ? n : k;
        if (ha > 1) {
            str = s;
            str += sprintf(str, "uint%lu ca = {0", ha);
            for (i = 1; i < ha; i++) {
                str += sprintf(str, ", %s * %u / %u",
                               vnames->lda, i, vecLen);
            }
            str += sprintf(str, "};\n");
            kgenAddStmt(ctx, s);
        }
        else {
            kgenAddStmt(ctx, "uint ca = 0;\n");
        }
        vnames->vectCoordA = "ca";

        if (hb > 1) {
            str = s;
            str += sprintf(str, "uint%lu cb = {0", hb);
            for (i = 1; i < hb; i++) {
                str += sprintf(str, ", %s * %u / %u",
                               vnames->ldb, i, vecLen);
            }
            str += sprintf(str, "};\n");
            kgenAddStmt(ctx, s);
        }
        else {
            kgenAddStmt(ctx, "uint cb = 0;\n");
        }
        vnames->vectCoordB = "cb";

        // uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3};
        // uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3};
    }
    kgenAddBlankLine(ctx);

    sprintf(s, "for (int it = 0; it < iter; it++)");
    kgenBeginBranch(ctx, s);

    if (!(localA && localB)) {
        kgenAddStmt(ctx, "uint k = 0;\n");
    }
    genZeroTile(ctx, &gset->tileCY);

    if (vecCoords) {
        char *coordsA[2] = {"workItemM", "k"};
        char *coordsB[2] = {"k", "workItemN"};

        sprintf(s, "A += %s * (lda / %u) + %s / %u;\n",
                coordsA[tra], vecLen, coordsA[1 - tra], vecLen);
        kgenAddStmt(ctx, s);
        sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n",
                coordsB[trb], vecLen, coordsB[1 - trb], vecLen);
        kgenAddStmt(ctx, s);
    }

    sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth);
    kgenBeginBranch(ctx, s);

    /* Copy data to local memory. We know that the size of the matrix is
     * the same as the size of one block, and use that. */
    if (localA) {
        sprintf(s, "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n"
                   "wait_group_events(1, &evA);\n"
                   "barrier(CLK_LOCAL_MEM_FENCE);\n",
                subdims[0].y * subdims[0].bwidth);
        kgenAddStmt(ctx, s);
        kgenAddStmt(ctx, "LPtr LAptr;\n");
        if (tra) {
            sprintf(s, "LAptr.%s = LA + workItemM;\n", ptrName);
        }
        else {
            sprintf(s, "LAptr.%s = LA + workItemM * %lu;\n", ptrName,
                    subdims[0].bwidth);
        }
        kgenAddStmt(ctx, s);
    }
    if (localB) {
        sprintf(s, "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n"
                   "wait_group_events(1, &evB);\n"
                   "barrier(CLK_LOCAL_MEM_FENCE);\n",
                subdims[0].x * subdims[0].bwidth);
        kgenAddStmt(ctx, s);
        kgenAddStmt(ctx, "LPtr LBptr;\n");
        if (trb) {
            sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n", ptrName,
                    subdims[0].bwidth);
        }
        else {
            sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName);
        }
        kgenAddStmt(ctx, s);
    }

    if (!separateFetch) {
        ret = tileMulGen(ctx, gset, mulOpts);
        checkRet(ret, "Multiplier");
    }
    else {
        Tile *tileA = &gset->tileA;
        Tile *tileB = &gset->tileBX;

        memset(&fetchOpts, 0, sizeof(fetchOpts));
        if (localA) {
            fetchOpts.memA = CLMEM_LOCAL_MEMORY;
        }
        if (localB) {
            fetchOpts.memB = CLMEM_LOCAL_MEMORY;
        }
        genFillTileWithNAN(ctx, tileA);
        genFillTileWithNAN(ctx, tileB);

        if (subdims[0].bwidth != subdims[1].bwidth) {
            sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
                    subdims[0].bwidth, k);
            kgenBeginBranch(ctx, s);
        }

#if JUST_MULTIPLICATION
        for (i = 0; i < tileA->nrRows; i++) {
            for (j = 0; j < tileA->nrCols; j++) {
                sprintfTileElement(&kstr, tileA, i, j, 1);
                sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j);
                kgenAddStmt(ctx, s);
            }
        }
        for (i = 0; i < tileB->nrRows; i++) {
            for (j = 0; j < tileB->nrCols; j++) {
                sprintfTileElement(&kstr, tileB, i, j, 1);
                sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j);
                kgenAddStmt(ctx, s);
            }
        }
#else
        fetchOpts.mrole = MATRIX_B;
        fetchOpts.lineOffset = 0;
        fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows;
        ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
        checkRet(ret, "Fetching tile b");

        fetchOpts.mrole = MATRIX_A;
        fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows;
        kgenAddBlankLine(ctx);
        fetchOpts.lineOffset = 0;
        ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts);
        checkRet(ret, "Fetching tile a");
#endif

        ret = genMulTiles(ctx, gset, mulOpts);
        checkRet(ret, "Multiplier");

#if ! JUST_MULTIPLICATION
        sprintf(s, "k += %lu;\n", k);
        kgenAddStmt(ctx, s);
#endif

        if (subdims[0].bwidth != subdims[1].bwidth) {
            kgenEndBranch(ctx, NULL);
        }
    }

    kgenEndBranch(ctx, NULL);    // K loop
    kgenEndBranch(ctx, NULL);    // iterations loop
    kgenAddBlankLine(ctx);

    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sprintfTileElement(&kstr, &gset->tileCY, i, j, 1);
            sprintf(s, "((GPtr)C).%s"
                       "[(%d + workItemM) * N + %d + workItemN] = %s;\n",
                    ptrName, i, j, kstr.buf);
            kgenAddStmt(ctx, s);
        }
    }

    kgenEndFuncBody(ctx);
}
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    char tmp[4096];
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    KernelExtraFlags kflags = kextra->flags;
    DataType dtype = kextra->dtype;
    bool doubleBased = isDoubleBasedType(dtype);
    size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered;
    ssize_t ret;
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    int tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A);
    int trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B);
    unsigned int l1Pans;
    TilePostFetchPrivate pfPriv[2];
    UpdateResultFlags upResFlags;
    TailStatus tailStatus;
    bool subgMode = false;
    SubgVarNames subgVNames;

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // mismatching subdims define the case with subgroup decomposition
    subgMode = (subdims[0].bwidth != subdims[1].bwidth);

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.flags = BGF_DISTINCT_VECLEN;
    gset.flags |= BGF_WHOLE_A;
    /*FIXME: This used to be a workaround for compilation issues with dtrmm
     *       on cpu. Normally BGF_WHOLE_A should be enabled always. But for
     *       now, there are wrong results for non-aligned cases on CPU and
     *       there is no workaround yet.
    if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K)) {
        gset.flags &= ~BGF_WHOLE_A;
    }*/
    gset.kextra = kextra;
    gset.pgran = pgran;

    //avoid [0].bw loop
    //gset.subdims[0].bwidth = gset.subdims[1].bwidth;

    memset(pfPriv, 0, sizeof(pfPriv));
    pfPriv[0].funcID = CLBLAS_TRMM;
    pfPriv[0].gset = &gset;
    if ((gset.flags & BGF_WHOLE_A) != 0) {
        pfPriv[0].wholeA = 1;
    }

    // at first, generate needed declarations
    kgenDeclareUptrs(ctx, doubleBased);

    // for the inner callback, because both callbacks use their own fetchNumA
    memcpy(&pfPriv[1], &pfPriv[0], sizeof(pfPriv[0]));

    // if both matrices are accessed row-major, use the subgroup pattern
    if (subgMode) {
        declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Subgroup",
                          true, true);
        gset.flags |= BGF_UPTRS;
    }
    else {
        declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Block",
                          true, true);
    }
    kgenBeginFuncBody(ctx);

    initDefaultTiles(&gset, CLBLAS_TRMM, 0, PRIV_STORAGE_VARIABLE_SET);
    declareTileStorages(ctx, &gset);

    kgenAddStmt(ctx, "uint currM, currN;\n"
                     "uint4 coord = 0; /* contains coordB, coordA, k */\n");
    kgenDeclareLocalID(ctx, "lid", pgran);
    kgenDeclareGroupID(ctx, "gid", pgran);

    if (subgMode) {
        gset.varNames.LDS = "scratch";

        // declaring variables used by subgroup mode
        subgVNames.itemId = "itemId";
        subgVNames.subgCoord = "subgCoord";

        kgenAddBlankLine(ctx);
        kgenAddBlankLine(ctx);
        kgenPrintf(ctx, "int2 %s;\n", subgVNames.itemId);
        kgenPrintf(ctx, "int2 %s;\n", subgVNames.subgCoord);

        // item ID
        kgenPrintf(ctx, "%s.x = get_local_id(0)%%%d;\n",
                   subgVNames.itemId,
                   subdims[0].bwidth / subdims[1].bwidth);
        // subgroup ID
        kgenPrintf(ctx, "%s.y = get_local_id(0)/%d;\n",
                   subgVNames.itemId,
                   subdims[0].bwidth / subdims[1].bwidth);
        // subgroup coordX
        kgenPrintf(ctx, "%s.x = %s.y/%d;\n",
                   subgVNames.subgCoord, subgVNames.itemId,
                   subdims[0].y / subdims[1].y);
        // subgroup coordY
        kgenPrintf(ctx, "%s.y = %s.y%%%d;\n",
                   subgVNames.subgCoord, subgVNames.itemId,
                   subdims[0].y / subdims[1].y);
    }

    kgenAddBlankLine(ctx);

    sprintf(tmp, "currN = gid * %lu;\n", subdims->x);
    kgenAddStmt(ctx, tmp);
    genInitCurrM(ctx, subdims, kflags);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        kgenAddStmt(ctx, "A += offA;\n");
    }
    genTrxmBMatrShift(ctx, kflags, true);

    if (subgMode) {
        kgenAddStmt(ctx, "GPtr Ag = {A};\n"
                         "GPtr Bg = {B};\n");
    }

    l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x;

    memset(&mulOpts, 0, sizeof(mulOpts));
    mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? TILEMUL_MAD
                                                       : TILEMUL_MULADD;
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
    mulOpts.postFetch = NULL;
    mulOpts.postFetchPriv = &pfPriv;
    mulOpts.flags = TILEMUL_NO_FLAGS;
    mulOpts.flags |= TILEMUL_EXTERN_RDECL;

    if (subgMode) {
        mulOpts.flags |= TILEMUL_NOT_INC_K;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    if (kflags & KEXTRA_TAILS_M_LOWER) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }
    if (kflags & KEXTRA_TAILS_N_LOWER) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B;
    }
    if (kflags & KEXTRA_TAILS_K_LOWER) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K;
        mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL;
    }
    if (tra) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if (!trb) {
        mulOpts.flags |= TILEMUL_TRB;
    }
    if (isMatrixConj(kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixConj(kflags, MATRIX_B)) {
        mulOpts.flags |= TILEMUL_CONJB;
    }

    initKernelVarNames(&gset.varNames);

    if (subgMode) {
        kgenPrintf(ctx, "coord.x = currN + %s.x*%d;\n",
                   subgVNames.subgCoord, subdims[1].x);
    }
    else {
        sprintf(tmp, "coord.x = currN + lid %% %u * %lu;\n",
                l1Pans, subdims[1].x);
        kgenAddStmt(ctx, tmp);
    }

    // loop over M
    sprintf(tmp, "for (uint m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
    kgenBeginBranch(ctx, tmp);

    genStartPosK(ctx, subdims, kflags, subgMode);

    sprintf(tmp, "coord.z = kBegin;\n");
    kgenAddStmt(ctx, tmp);

    if (subgMode) {
        kgenPrintf(ctx, "coord.y = currM + %s.y*%d;\n",
                   subgVNames.subgCoord, subdims[1].y);
    }
    else {
        sprintf(tmp, "coord.y = currM + lid / %u * %lu;\n",
                l1Pans, subdims[1].y);
        kgenAddStmt(ctx, tmp);
    }

    genZeroTile(ctx, &gset.tileCY);
    checkGenBeginHitMatrixBlock(ctx, kflags);
    tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRMM, &gset, NULL);

    // loops along 'K'
    if (subgMode) {
        ret = genSubgLoopsK(ctx, &gset, &mulOpts, &subgVNames, staggered);
    }
    else {
        ret = genLoopsK(ctx, &gset, &mulOpts, tmp);
    }
    if (ret != 0) {
        printf("%s", buf);
        return ret;
    }

    checkGenEndHitMatrixBlock(ctx, kflags);
    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    // store results;
    // for the result update, the x coordinate is in elements, not in vectors
    checkGenRestoreTailCoords(ctx, &gset, tailStatus);
    upResFlags = kextraToUpresFlags(CLBLAS_TRMM, kflags);
    upResFlags |= tailStatusToUpresFlags(tailStatus);
    upResFlags |= UPRES_INDEXING_WITH_CONSTANTS;
    upResFlags |= UPRES_TRIANG_WRITE_C;
    upResFlags |= UPRES_EXCEED_PROBLEM_CONDITION;

    if (subgMode) {
        mergeUpdateResult(ctx, CLBLAS_TRMM, &gset, &subgVNames, upResFlags,
                          genResultUpdateWithFlags);
    }
    else {
        //checkGenBeginHitMatrixBlock(ctx, kflags);
        genResultUpdateWithFlags(ctx, CLBLAS_TRMM, &gset, upResFlags,
                                 NULL, NULL, NULL);
        //checkGenEndHitMatrixBlock(ctx, kflags);
    }

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "currM += %lu;\n", subdims[0].y);
    }
    else {
        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
    }
    kgenAddStmt(ctx, tmp);
    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
// global memory based kernel generator
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    char tmp[4096], tmp1[4096];
    char *p;
    const char *typeName;
    char fpref;
    DataType dtype = kextra->dtype;
    ssize_t ret;
    BlasGenSettings gset;
    BlkMulOpts mulOpts;
    unsigned int tsize;
    unsigned int vecLen, outVecLen;
    bool b;
    const char *outTypeName;
    unsigned int i;
    unsigned int nrRegs, regPitch;
    int tra, trb;
    char vect[2] = {'y', 'x'};

    const char *coordConstants =
        "const uint workItemM = get_global_id(0) * %lu;\n"
        "const uint workItemN = get_global_id(1) * %lu;\n"
        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
        "uint vectK = (K + %u) / %u;\n";

    /*
     * template for the image based gemm preparation part
     * for a two dimensional work space
     */
    const char *localVariables =
        "uint k0;\n"
        "int2 coordA = (int2)(0, workItemM);\n"
        "int2 coordB = (int2)(0, workItemN);\n"
        "%s c[%u];\n\n";

    tsize = dtypeSize(dtype);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    if (isComplexType(dtype)) {
        regPitch = (unsigned int)subdims[1].x;
    }
    else {
        regPitch = (unsigned int)fl4RowWidth(subdims[1].x, tsize) *
                   sizeof(cl_float4) / tsize;
    }

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;
    initKernelVarNames(&gset.varNames, kextra->flags);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations and auxiliary functions
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    // now, generate the kernel
    sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
            typeName, typeName, typeName);
    kgenDeclareFunction(ctx, tmp);
    ret = kgenBeginFuncBody(ctx);

    // constants
    sprintf(tmp, coordConstants, subdims[1].y, subdims[1].x, subdims[1].y,
            vecLen - 1, vecLen);
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then declare local
     * variables
     */
    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);

    sprintf(tmp, localVariables, outTypeName, nrRegs);
    kgenAddStmt(ctx, tmp);

    // return if the work item falls beyond the matrix bounds
    kgenAddStmt(ctx, "if ((workItemM >= M) ||"
                         "(workItemN >= N)) {\n"
                     "    return;\n"
                     "}\n");
    kgenAddStmt(ctx, "C += offsetC;\n");

    // zero the C block
    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
                 "    c[k0] = 0;\n"
                 "}\n\n", nrRegs);
    kgenAddStmt(ctx, tmp);

    // loop with the inlined block multiplication function
    sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)",
            subdims[1].bwidth / vecLen);
    kgenBeginBranch(ctx, tmp);

    mulOpts.aMobj = CLMEM_IMAGE;
    mulOpts.bMobj = CLMEM_IMAGE;
    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE;
    if (isComplexType(dtype)) {
        mulOpts.core = BLKMUL_SEPARATE_MULADD;
    }
    else {
        mulOpts.core = BLKMUL_MAD;
    }
    mulOpts.argNames.coordA = "coordA";
    mulOpts.argNames.coordB = "coordB";
    mulOpts.argNames.skewCol = "skewCol";
    mulOpts.argNames.skewRow = "skewRow";
    mulOpts.argNames.k = "k0";
    mulOpts.argNames.vectBoundK = "vectK";
    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
    if (ret) {
        destroyKgenContext(ctx);
        return -EOVERFLOW;
    }

    // update image coordinates
    sprintf(tmp, "\ncoordA.x += %lu;\n"
                 "coordB.x += %lu;\n",
            subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
    kgenAddStmt(ctx, tmp);
    kgenEndBranch(ctx, NULL);

    // compensate the row skew in the result
    outVecLen = isComplexType(dtype) ? 1 : vecLen;
    p = tmp1;
    for (i = 0; i < regPitch / outVecLen; i++) {
        unsigned int k = (unsigned int)(subdims[1].y - 1) *
                         regPitch / outVecLen + i;

        sprintf(p, "\n"
                   "    tmp = c[%u];\n"
                   "    for (j = %lu; j >= 0; j--) {\n"
                   "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                   "    }\n"
                   "    c[%u] = tmp;\n",
                k, subdims[1].y - 2, regPitch / outVecLen, i,
                regPitch / outVecLen, i, i);
        p += strlen(p);
    }
    sprintf(tmp, "\n"
                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
                 "    int j;\n"
                 "    %s tmp;\n"
                 "%s"
                 "}\n"
                 "\n", outTypeName, tmp1);
    kgenAddStmt(ctx, tmp);

    tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B);
    sprintf(tmp, "coordA.%c = workItemM;\n"
                 "coordB.%c = workItemN;\n\n", vect[tra], vect[trb]);
    kgenAddStmt(ctx, tmp);

    // write back the evaluated tile
    generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL);
    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
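/*
 * A worked instance of the skew-compensating rotation emitted above.
 * Assuming, purely for illustration, subdims[1].y == 4,
 * regPitch / outVecLen == 2 and outTypeName == "float4", the generated
 * code reads:
 *
 *     for (k0 = 0; k0 < skewRow.y; k0++) {
 *         int j;
 *         float4 tmp;
 *
 *         tmp = c[6];
 *         for (j = 2; j >= 0; j--) {
 *             c[(j+1) * 2 + 0] = c[j * 2 + 0];
 *         }
 *         c[0] = tmp;
 *
 *         tmp = c[7];
 *         for (j = 2; j >= 0; j--) {
 *             c[(j+1) * 2 + 1] = c[j * 2 + 1];
 *         }
 *         c[1] = tmp;
 *     }
 *
 * Each k0 iteration cyclically rotates every register column of the C
 * block down by one row, undoing the row skew accumulated during the
 * multiplication stage.
 */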
static ssize_t
generator(
    char *buf,
    size_t buflen,
    const struct SubproblemDim *subdims,
    const struct PGranularity *pgran,
    void *extra)
{
    char tmp[1024];
    struct KgenContext *ctx;
    ssize_t ret;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    DataType dtype = kextra->dtype;
    KernelExtraFlags kflags = kextra->flags;
    CLBLASKernExtra extraNew;
    BlasGenSettings gset;
    TileMulOpts mulOpts;
    const char *ptrName;
    UpdateResultFlags upFlags = 0;
    TilePostFetchPrivate pfPriv;
    unsigned int l1Pans;
    bool b;
    Tile parTile;
    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
    int ldsLarge, lds_diagonal;
    bool isInline;
    TileSet tileSet;
    char copy2LDSFuncName[FUNC_NAME_MAXLEN];
    TailStatus tailStatus = 0;
    FetchAddrMode addrMode = 0;
    bool tailM = ((kflags & KEXTRA_TAILS_M) != 0);
    bool tailN = ((kflags & KEXTRA_TAILS_N) != 0);
    size_t alignK;

    if (pgran->wgDim != 1) {
        return -EINVAL;
    }

    l1Pans = (unsigned int)(subdims[0].x / subdims[1].x);

    memset(&gset, 0, sizeof(gset));
    gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS;
    memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2);
    // there is no need for a block structure along K
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    subdims = gset.subdims;

    /*
     * Tiles change dynamically: in the main tilemul loop they are
     * rectangular, but at the second stage both the A and B tile storages
     * hold square tiles. The physical vectorization must be adjusted
     * accordingly, so that the vector length is not greater than the
     * linear size of any tile.
     */
    memcpy(&extraNew, kextra, sizeof(extraNew));
    extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y);
    extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y);

    gset.pgran = pgran;
    gset.kextra = &extraNew;
    initKernelVarNames(&gset.varNames);

    // multiplication options
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;
    mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD
                                                       : TILEMUL_MULADD;
    mulOpts.postFetch = NULL;
    mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags);
    mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K;
    mulOpts.fctx = createFetchContext();
    if (mulOpts.fctx == NULL) {
        return -ENOMEM;
    }

    disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING);

    isInline = (gset.flags & BGF_EXPLICIT_INLINE);

    initTiles(&gset, &tileSet, subdims, kflags, dtype,
              PRIV_STORAGE_VARIABLE_SET);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        destroyFetchContext(mulOpts.fctx);
        return -ENOMEM;
    }

    kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n");

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    if (isComplexType(dtype)) {
        genComplexMathOperators(ctx, dtype);
    }
    if (!isInline) {
        genTileInverting(ctx, &gset, &tileSet);
    }

    if (extraParams->ldsUse != LDS_NO_USE) {
        SubproblemDim sdims;
        DBlockCopyFlags flags;
        unsigned int vecLen;

        if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) {
            sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor;
            sdims.y = gset.subdims[0].x;
        }
        else {
            sdims.x = gset.subdims[0].x;
            sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor;
        }

        vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B);
        flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0;
        copyDataBlockGen(ctx, &sdims, gset.pgran, dtype,
                         DBLOCK_GLOBAL_TO_LOCAL, flags);
        kgenAddBlankLine(ctx);
        kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx);
    }

    declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached",
                      false, true);
    kgenBeginFuncBody(ctx);

    declareLocalVariables(ctx, &gset, &parTile, extraParams);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        kgenAddStmt(ctx, "A += offA;\n");
    }
    genTrxmBMatrShift(ctx, kflags, false);

    ptrName = dtypeUPtrField(dtype);
    sprintf(tmp, "uB.%s = B;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // external loop
    sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y);
    kgenBeginBranch(ctx, tmp);

    genZeroTile(ctx, &gset.tileCY);
    genSetupCoords(ctx, &gset, BLOCK_UPDATE);
    kgenAddStmt(ctx, "// Stage 1. Multiply and update with large blocks\n");

    gset.tileA = tileSet.rectA;
    gset.tileBX = tileSet.origB;

    if (!isMatrixUpper(kflags) && tailM) {
        addrMode |= FETCH_ADDR_A_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0);

    alignK = subdims[1].bwidth;
    if (ldsLarge) {
        alignK *= extraParams->unrollingFactor;
    }

    if (ldsLarge) {
        const char *oldCoordB;
        FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE;
        bool withSkew;

        withSkew = useSkewedFetchB(&gset);
        if (!withSkew) {
            bamode |= FETCH_ADDR_B_RELATIVE;
        }
        else {
            bamode |= FETCH_ADDR_B_CYCLICAL;
        }
        setFetchAddrMode(mulOpts.fctx, bamode);

        if (tailN) {
            /*
             * Conditional branch for those items whose matrix coordinates
             * fall within matrix B
             */
            sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x);
            kgenBeginBranch(ctx, tmp);
        }

        if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) {
            kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName);
        }
        else {
            kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName);
        }

        if (withSkew) {
            unsigned int bwidthOld;

            oldCoordB = gset.varNames.coordB;
            gset.varNames.coordB = "skewX";
            bwidthOld = gset.subdims[0].bwidth;
            gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows
                                                     : parTile.nrCols;
            gset.subdims[0].bwidth = bwidthOld;
        }

        genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK);
        genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile,
                            copy2LDSFuncName);
        genInternalLoopEnd(ctx);                        // loop over K

        if (withSkew) {
            gset.varNames.coordB = oldCoordB;
            setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL);
            // remove the skew from the result before proceeding
            // to the next stage
            genTileCyclicalShift(ctx, &gset);
        }

        if (tailN) {
            kgenEndBranch(ctx, NULL);
            kgenBeginBranch(ctx, "else");
        }

        setFetchAddrMode(mulOpts.fctx, addrMode);
    }

    if (!ldsLarge || tailN) {
        genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
        if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) {
            addrMode |= FETCH_ADDR_B_CYCLICAL;
            setFetchAddrMode(mulOpts.fctx, addrMode);
        }

        if (tailN) {
            sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")");
            kgenBeginBranch(ctx, tmp);
        }

        genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK);
        tileMulGen(ctx, &gset, &mulOpts);
        genInternalLoopEnd(ctx);                        // loop over K

        if (tailN) {
            kgenEndBranch(ctx, NULL);
        }
        if (extraParams->ldsUse & LDS_USE_LARGE) {
            kgenEndBranch(ctx, NULL);
        }
    }

    sprintf(tmp, "uA.%s = A;\n\n", ptrName);
    kgenAddStmt(ctx, tmp);

    // processing tails along the update dimension
    if (isMatrixUpper(kflags) &&
        ((kflags & KEXTRA_TAILS_K_LOWER) ||
         (ldsLarge && extraParams->unrolledTail))) {

        unsigned int tailChunks;

        tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ?
            extraParams->unrolledTail : 1;

        if (tailN) {
            char hitCond[1024];

            sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")");
            sprintf(tmp, "if ((currM + %lu < M) && %s)",
                    subdims[0].y, hitCond);
        }
        else {
            sprintf(tmp, "if (currM + %lu < M)", subdims[0].y);
        }
        kgenBeginBranch(ctx, tmp);

        if (kflags & KEXTRA_TAILS_K_LOWER) {
            setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL);
            setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        }

        if (tailChunks > 1) {
            mulOpts.flags &= ~TILEMUL_NOT_INC_K;
            sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks);
            kgenBeginBranch(ctx, tmp);
        }

        addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);

        if (tailChunks > 1) {
            kgenEndBranch(ctx, NULL);
            mulOpts.flags |= TILEMUL_NOT_INC_K;
        }
        kgenEndBranch(ctx, NULL);
    }

    gset.tileA = tileSet.squareA;

    kgenAddStmt(ctx, "\n/*\n"
                     " * Stage 2. Some work items multiply the obtained "
                     "result by the respective\n"
                     " * inverted diagonal block while the remaining ones "
                     "wait. Then they perform\n"
                     " * one more step of the intermediate result "
                     "evaluation, multiplying tile\n"
                     " * by tile. This continues until the whole panel of "
                     "matrix A is processed\n"
                     " */\n");

    // from here on, one must deal strictly with square blocks
    gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y;

    sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)",
            subdims[0].y / subdims[1].y);
    kgenBeginBranch(ctx, tmp);

    if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
        sprintf(tmp, "const int bid = lid %% %u;\n\n", l1Pans);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Update the intermediate result, multiply it by the inverted
     * diagonal tile, and write it back
     */
    genSetupCoords(ctx, &gset, TILE_UPDATE);
    sprintfStage2Condition(tmp, &gset, 0);
    ret = kgenBeginBranch(ctx, tmp);

    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);
    upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA;
    genUpdateIntermResult(ctx, &gset, false, upFlags);
    kgenAddBlankLine(ctx);

    lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) &&
                    (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 &&
                    !(tailM || tailN) &&
                    !(upFlags & UPRES_NO_VECTORIZATION) &&
                    !isComplexType(kextra->dtype));

    /*
     * Adjust the addressing mode of A now, so that fetches do not
     * exceed the bounds of A
     */
    if (tailM) {
        setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_A_CYCLICAL |
                                       FETCH_ADDR_K_CYCLICAL);
        extraNew.flags |= KEXTRA_TAILS_K_LOWER;
    }

    genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts);
    gset.tileBX = tileSet.bStage2;

    if (tailM) {
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
    }

    kgenAddStmt(ctx, "// Write back the obtained result\n");
    upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags);
    upFlags |= tailStatusToUpresFlags(tailStatus);
    if (lds_diagonal) {
        sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans);
    }
    genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags,
                             NULL, NULL, lds_diagonal ? tmp : NULL);
    kgenEndBranch(ctx, NULL);   // multiply on the inverted tile path

    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);

    // continue the tile update
    kgenAddBlankLine(ctx);
    sprintfStage2Condition(tmp, &gset, 1);
    kgenBeginBranch(ctx, tmp);
    genCheckShiftTailB(ctx, &gset, 0, &tailStatus);
    if (lds_diagonal) {
        // TODO: add here storing to LDS as well
    }
    else {
        addrMode |= FETCH_ADDR_B_CYCLICAL;
        setFetchAddrMode(mulOpts.fctx, addrMode);
        tileMulGen(ctx, &gset, &mulOpts);
    }
    kgenEndBranch(ctx, NULL);   // tile update path

    kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE);
    kgenEndBranch(ctx, NULL);   // second stage loop

    if (isMatrixUpper(kflags)) {
        sprintf(tmp, "currM -= %lu;\n", subdims[0].y);
        kgenAddStmt(ctx, tmp);
    }
    kgenEndBranch(ctx, NULL);   // loop over M

    ret = kgenEndFuncBody(ctx);
    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyFetchContext(mulOpts.fctx);
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
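/*
 * The overall shape of the TRSM kernel emitted by the generator above
 * (an illustrative sketch only; tail guards and the LDS variants are
 * omitted, and dy0/dy1 stand for subdims[0].y and subdims[1].y):
 *
 *     for (m0 = 0; m0 < M; m0 += dy0) {
 *         // Stage 1: multiply and update with large rectangular blocks,
 *         // optionally preloading panels of B into LDS
 *         for (k0 = ...; ...; k0 += bwidth) { ... }      // tilemul
 *
 *         // Stage 2: per diagonal block
 *         for (m1 = 0; m1 < dy0 / dy1; m1++) {
 *             if (<this item owns the diagonal tile>) {
 *                 // update the intermediate result, multiply it by the
 *                 // inverted diagonal tile, write it back
 *             }
 *             barrier(CLK_GLOBAL_MEM_FENCE);
 *             if (<this item still has pending rows>) {
 *                 // one more tile-by-tile multiplication step
 *             }
 *             barrier(CLK_GLOBAL_MEM_FENCE);
 *         }
 *     }
 */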
int
tileMulGen(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    const TileMulOpts *mulOpts)
{
    char s[MAX_LENGTH];
    unsigned int vlenA, vlenB;
    unsigned int i, iend;       // counters
    int ret = 0;
    TileMulFlags mflags = mulOpts->flags;
    bool tra = ((mflags & TILEMUL_TRA) != 0);
    bool trb = ((mflags & TILEMUL_TRB) != 0);
    bool localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY);
    bool localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY);
    bool internalFetchB = ((mflags & TILEMUL_NOT_FETCH_B) == 0);
    bool bwStride = ((mflags & TILEMUL_BW_STRIDE) != 0);
    bool incK = ((mflags & TILEMUL_NOT_INC_K) == 0);
    const SubproblemDim *subdims = gset->subdims;
    size_t bwidth = bwStride ? subdims[0].bwidth : subdims[1].bwidth;
    TileMulCore core = mulOpts->core;
    DataType dtype = gset->kextra->dtype;
    const KernelVarNames *varNames = &gset->varNames;
    FetchOpts fetchOpts;
    struct FetchContext *fctx = mulOpts->fctx;
    FetchAddrMode addrMode;
    FetchOptLevel foptlev;
    struct StatementBatch *batch = NULL;
    const Tile *tile;

    memset(&fetchOpts, 0, sizeof(fetchOpts));
    fetchOpts.memA = mulOpts->memA;
    fetchOpts.memB = mulOpts->memB;

    kgenAddStmt(ctx, "/* -- Tiles multiplier -- */\n");

    getVecLens(gset, &vlenA, &vlenB, NULL);

    /* check generator input values */
    ret = checkInput(gset, mulOpts);
    if (ret) {
        return ret;
    }

    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
        sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)",
                subdims[0].bwidth, subdims[1].bwidth);
        kgenBeginBranch(ctx, s);
    }

    core = checkReplaceCore(gset, core, tra, trb);
    if ((core == TILEMUL_MULADD || isComplexType(dtype)) && !tra && trb) {
        unsigned int n;
        const char *tname;

        n = commonTileSegmentLen(&gset->tileA, &gset->tileBX);
        getVectorTypeName(gset->tileA.dtype, n, &tname, NULL);
        sprintf(s, "%s sum;\n", tname);
        kgenAddStmt(ctx, s);
    }

    // FIXME: remove this kludge for backward compatibility
    if (fctx == NULL) {
        fctx = createFetchContext();
        if (fctx == NULL) {
            return -ENOMEM;
        }
        fetchOpts.mulOpts = mulOpts;
    }

    foptlev = getFetchOptLevels(fctx);
    if ((gset->flags & BGF_WHOLE_A) && internalFetchB &&
        (foptlev & FOPTLEV_MERGE_FETCHES)) {

        batch = createStmtBatch();
        if (batch == NULL) {
            ret = -ENOMEM;
            goto out;
        }
    }

    /*
     * First, disable sharing of the internal variables of the fetch code,
     * so that the fetch generator declares them for the first matrix.
     * Then re-enable it when invoking the fetch for the other matrix,
     * if it has actually been enabled.
     */
    disableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);

    /*
     * fetch elements of the matrix B, by rows or by columns depending on
     * the transposing flag
     */
    if (internalFetchB) {
        tile = &gset->tileBX;
        fetchOpts.mrole = MATRIX_B;
        fetchOpts.linesNum = trb ? tile->nrCols : tile->nrRows;
        if (batch == NULL) {
            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
            if (!ret) {
                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
            }
        }
        else {
            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
        }
    }

    fetchOpts.mrole = MATRIX_A;
    if (foptlev & FOPTLEV_CAN_SHARE_TMP_AB) {
        enableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB);
    }

    if (ret) {
        goto out;
    }

    if (gset->flags & BGF_WHOLE_A) {
        tile = &gset->tileA;
        iend = (tra) ? tile->nrCols : tile->nrRows;
        fetchOpts.linesNum = iend;
        if (batch == NULL) {
            ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts);
        }
        else {
            genFetchInputTileBatch(batch, fctx, gset, &fetchOpts);
            ret = flushStmtBatch(ctx, batch);
            if (!ret) {
                ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B);
            }
        }
        if (!ret) {
            ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
        }
        if (ret) {
            goto out;
        }

        // main multiplying loop
        for (i = 0; i < iend; i++) {
            if (i) {
                kgenAddBlankLine(ctx);
            }
            genMulLineOnTile(ctx, gset, mulOpts, i, true);
        }
    }
    else {
        iend = (unsigned int)((tra) ? subdims[1].bwidth : subdims[1].y);
        fetchOpts.linesNum = 1;

        // main multiplying loop
        for (i = 0; i < iend; i++) {
            if (i) {
                kgenAddBlankLine(ctx);
                revalidateFetchContext(fctx, MATRIX_A);
            }
            // fetch elements of matrix A from a single row
            fetchOpts.lineOffset = i;
            genFetchInputTile(ctx, fctx, gset, &fetchOpts);
            ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A);
            if (ret) {
                goto out;
            }
            genMulLineOnTile(ctx, gset, mulOpts, i, false);
        }
    }

    /*
     * increment K-related coordinates or pointers depending on
     * the addressing mode
     */
    addrMode = getFetchAddrMode(fctx);
    if (addrMode & FETCH_ADDR_K_RELATIVE) {
        kgenAddBlankLine(ctx);
        genPointerUpdate(ctx, varNames->A, varNames->lda, bwidth,
                         subdims[0].y, vlenA, dtype, gset->flags,
                         !tra, localA);
        genPointerUpdate(ctx, varNames->B, varNames->ldb, bwidth,
                         subdims[0].x, vlenB, dtype, gset->flags,
                         trb, localB);
    }
    else {
        if (incK && (varNames->k != NULL) && !(localA && localB)) {
            sprintf(s, "\n%s += %lu;\n", varNames->k, bwidth);
            kgenAddStmt(ctx, s);
        }
    }

    if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) {
        kgenEndBranch(ctx, NULL);   // k1 loop
    }

    ret = kgenAddStmt(ctx, "/* ---------------------- */\n");
    ret = (ret) ? -EOVERFLOW : 0;

out:
    if (batch != NULL) {
        destroyStmtBatch(batch);
    }
    if (fctx != mulOpts->fctx) {
        destroyFetchContext(fctx);
    }

    return ret;
}
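/*
 * A minimal calling sketch for tileMulGen(), mirroring how the generators
 * above drive it. gset is assumed to be an already populated
 * BlasGenSettings, and kflags the kernel's extra flags:
 *
 *     TileMulOpts mulOpts;
 *     int ret;
 *
 *     memset(&mulOpts, 0, sizeof(mulOpts));
 *     mulOpts.memA = CLMEM_GLOBAL_MEMORY;
 *     mulOpts.memB = CLMEM_GLOBAL_MEMORY;
 *     mulOpts.core = (kflags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD
 *                                                 : TILEMUL_MULADD;
 *     mulOpts.flags = TILEMUL_EXTERN_RDECL;
 *     mulOpts.fctx = createFetchContext();   // may be left NULL; then a
 *                                            // temporary context is created
 *                                            // and destroyed internally
 *     ret = tileMulGen(ctx, &gset, &mulOpts);
 *     destroyFetchContext(mulOpts.fctx);
 */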
static int
genSubgLoopsK(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    SubgVarNames *pSubgVNames,
    size_t staggered)
{
    char tmp[1024];
    KernelExtraFlags kflags = gset->kextra->flags;
    const size_t y0 = gset->subdims[0].y;
    const size_t bw1 = gset->subdims[1].bwidth;
    const size_t bw0 = gset->subdims[0].bwidth;
    // bwidth that will be used for the diagonal block evaluation
    size_t diagBw1 = getVecLen(gset, CLBLAS_TRMM, MATRIX_A);
    // save the dimensions of tiles A and B that will be changed
    // for the diagonal block
    size_t sDimA = gset->tileA.trans ? gset->tileA.nrRows
                                     : gset->tileA.nrCols;
    size_t sDimB = gset->tileBX.trans ? gset->tileBX.nrRows
                                      : gset->tileBX.nrCols;
    const CLBLASKernExtra *psKExtra = gset->kextra;
    CLBLASKernExtra diagKExtra;
    TilePostFetchPrivate postFPriv;
    int ret = 0;

    kgenPrintf(ctx, "uint k0;\n");
    kgenPrintf(ctx, "uint kMax;\n");

    // upper triangle case
    if (isMatrixUpper(kflags)) {
        // diagonal part ---------------------------------------------------

        // adjust the tile and kextra settings for processing
        // the diagonal block
        gset->subdims[1].bwidth = diagBw1;
        if (gset->tileA.trans) {
            gset->tileA.nrRows = diagBw1;
        }
        else {
            gset->tileA.nrCols = diagBw1;
        }
        if (gset->tileBX.trans) {
            gset->tileBX.nrRows = diagBw1;
        }
        else {
            gset->tileBX.nrCols = diagBw1;
        }

        memcpy(&diagKExtra, gset->kextra, sizeof(CLBLASKernExtra));
        diagKExtra.vecLenA = (diagBw1 < psKExtra->vecLenA) ?
            diagBw1 : psKExtra->vecLenA;
        diagKExtra.vecLenB = (diagBw1 < psKExtra->vecLenB) ?
            diagBw1 : psKExtra->vecLenB;
        gset->kextra = (const CLBLASKernExtra*)&diagKExtra;

        // process the triangle block by item 0 of each subgroup
        kgenPrintf(ctx, "// k-coordinate of the end of diagonal block\n");
        kgenPrintf(ctx, "// calculated to be aligned to bw1\n");
        kgenPrintf(ctx, "kMax = kBegin + %lu + (%lu - %lu%%(kBegin+%lu));\n",
                   y0, bw1, bw1, y0);

        sprintf(tmp, "if( %s.x == 0 )", pSubgVNames->itemId);
        kgenBeginBranch(ctx, tmp);

        sprintf(tmp, "for( k0=kBegin; (k0<kMax)&&(k0<M); k0+=%lu )",
                diagBw1);
        kgenBeginBranch(ctx, tmp);

        kgenPrintf(ctx, "%s=k0;\n", gset->varNames.k);

        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }

        kgenEndBranch(ctx, NULL);   // for()
        kgenEndBranch(ctx, NULL);   // if( itemId.x == 0 )

        // restore the tile and kextra settings to the original parameters
        gset->subdims[1].bwidth = bw1;
        if (gset->tileA.trans) {
            gset->tileA.nrRows = sDimA;
        }
        else {
            gset->tileA.nrCols = sDimA;
        }
        if (gset->tileBX.trans) {
            gset->tileBX.nrRows = sDimB;
        }
        else {
            gset->tileBX.nrCols = sDimB;
        }
        gset->kextra = psKExtra;

        // rectangle part --------------------------------------------------
        kgenAddBlankLine(ctx);
        kgenPrintf(ctx, "k0 = kMax;\n");
        if (kflags & KEXTRA_TAILS_K_LOWER) {
            kgenPrintf(ctx, "uint alignedK = M-(M%%%lu);\n", bw1);
        }

        // strided access
        sprintf(tmp, "for ( k0 = k0+%s.x*%lu; k0 < %s; k0 += %lu )",
                pSubgVNames->itemId, bw1,
                (kflags & KEXTRA_TAILS_K_LOWER) ? "alignedK" : "M", bw0);
        kgenBeginBranch(ctx, tmp);

        // TODO: make staggered access operational with lower-K tails
        /*kgenPrintf( ctx,
            "%s = (kBegin+%d) + ( m0*64*(gid%%2) + k0 )%%(M-(kBegin+%d));\n",
            gset->varNames.k, diagW, diagW); */
        kgenPrintf(ctx, "%s = k0;\n", gset->varNames.k);

        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // rectangle tail part ---------------------------------------------
        if (kflags & KEXTRA_TAILS_K_LOWER) {
            kgenAddBlankLine(ctx);
            kgenPrintf(ctx,
                "// lower K tail is handled by item 0 of each subgroup\n");
            sprintf(tmp, "if( (%s.x == 0)&&(kMax < M) )",
                    pSubgVNames->itemId);
            kgenBeginBranch(ctx, tmp);

            kgenPrintf(ctx, "%s = alignedK;\n", gset->varNames.k);

            postFPriv.fetchNumA = 0;
            postFPriv.gset = gset;
            mulOpts->postFetch = defaultTilePostFetch;
            mulOpts->postFetchPriv = &postFPriv;

            ret = tileMulGen(ctx, gset, mulOpts);
            if (ret != 0) {
                return ret;
            }
            kgenEndBranch(ctx, NULL);
        }
    }
    // lower triangle case
    else {
        // rectangle part --------------------------------------------------
        kgenPrintf(ctx, "kMax = currM - currM%%%lu;\n", bw1);

        // strided access, staggered access
        sprintf(tmp, "for( k0 = 0; k0 < kMax; k0 += %lu )", bw0);
        kgenBeginBranch(ctx, tmp);
        kgenPrintf(ctx, "%s=(k0+%s.x*%lu+%lu*gid)%%kMax;\n",
                   gset->varNames.k, pSubgVNames->itemId, bw1,
                   staggered / bw1 * bw1);

        // no post-fetch zeroing of diagonal elements in this part
        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // diagonal part ---------------------------------------------------

        // adjust the tile and kextra settings for processing
        // the diagonal block
        gset->subdims[1].bwidth = diagBw1;
        if (gset->tileA.trans) {
            gset->tileA.nrRows = diagBw1;
        }
        else {
            gset->tileA.nrCols = diagBw1;
        }
        if (gset->tileBX.trans) {
            gset->tileBX.nrRows = diagBw1;
        }
        else {
            gset->tileBX.nrCols = diagBw1;
        }

        psKExtra = gset->kextra;
        memcpy(&diagKExtra, gset->kextra, sizeof(CLBLASKernExtra));
        diagKExtra.vecLenA = (diagBw1 < psKExtra->vecLenA) ?
            diagBw1 : psKExtra->vecLenA;
        diagKExtra.vecLenB = (diagBw1 < psKExtra->vecLenB) ?
            diagBw1 : psKExtra->vecLenB;
        gset->kextra = (const CLBLASKernExtra*)&diagKExtra;

        // process the triangle block by item 0 of each subgroup
        sprintf(tmp, "if( %s.x == 0 )", pSubgVNames->itemId);
        kgenBeginBranch(ctx, tmp);

        sprintf(tmp,
                "for( k0 = kMax; (k0 < currM+%lu)&&(k0 < M); k0 += %lu )",
                y0, diagBw1);
        kgenBeginBranch(ctx, tmp);

        kgenPrintf(ctx, "%s=k0;\n", gset->varNames.k);

        mulOpts->postFetch = genTrxmPostFetchZero;
        resetFetchNumA(mulOpts);

        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);   // for()
        kgenEndBranch(ctx, NULL);   // if( itemId.x == 0 )

        // restore the tile and kextra settings to the original parameters
        gset->subdims[1].bwidth = bw1;
        if (gset->tileA.trans) {
            gset->tileA.nrRows = sDimA;
        }
        else {
            gset->tileA.nrCols = sDimA;
        }
        if (gset->tileBX.trans) {
            gset->tileBX.nrRows = sDimB;
        }
        else {
            gset->tileBX.nrCols = sDimB;
        }
        gset->kextra = psKExtra;
    }

    return 0;
}