int declareOneTileStorage(struct KgenContext *ctx, const Tile *tile) { char tmp[1024]; const char *tname; int r; size_t size; getVectorTypeName(tile->dtype, tile->vecLen, &tname, NULL); size = tileVectorsNum(tile); if (tile->storType == PRIV_STORAGE_ARRAY) { sprintf(tmp, "%s %s[%lu];\n", tname, tile->baseName, size); } else { size_t i; char *p; sprintf(tmp, "%s %s0", tname, tile->baseName); p = tmp + strlen(tmp); for (i = 1; i < size; i++) { sprintf(p, ", %s%lu", tile->baseName, i); p += strlen(p); } strcpy(p, ";\n"); } r = kgenAddStmt(ctx, tmp); return (r) ? -EOVERFLOW : 0; }
static void genPreloadedTileMul( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, const Tile *parTile, const char* copy2LDSFuncName) { char tmp[1024]; KernelExtraFlags kflags = gset->kextra->flags; unsigned int bwidthOld; const char *oldNameB; const char *ptrName; getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName); kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) { sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n", copy2LDSFuncName, gset->subdims[0].x); } else { sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n", copy2LDSFuncName, gset->subdims[0].x); } kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "lB = lBMain;\n\n"); mulOpts->memB = CLMEM_LOCAL_MEMORY; oldNameB = gset->varNames.B; bwidthOld = (unsigned int)gset->subdims[0].bwidth; gset->varNames.B = "lB"; gset->subdims[0].bwidth = (parTile->trans) ? parTile->nrRows : parTile->nrCols; tileMulGen(ctx, gset, mulOpts); gset->varNames.B = oldNameB; gset->subdims[0].bwidth = bwidthOld; mulOpts->memB = CLMEM_GLOBAL_MEMORY; }
static void genPointerUpdate( struct KgenContext *ctx, const char *ptrName, const char *ldName, size_t bwidth, size_t bheight, unsigned int vecLen, DataType dtype, BlasGenFlags gflags, bool rowMaj, bool isLocal) { const char *uptr; Kstring tmp; const char *p; if (gflags & BGF_UPTRS) { getVectorTypeName(dtype, vecLen, NULL, &uptr); ksprintf(&tmp, "%s.%s", ptrName, uptr); p = tmp.buf; } else { p = ptrName; } if (rowMaj) { kgenPrintf(ctx, "%s += %lu;\n", p, bwidth / vecLen); } else if (isLocal) { kgenPrintf(ctx, "%s += %lu;\n", p, bwidth * (bheight / vecLen)); } else { Kstring ld; Kstring bwStr, madExpr; unsigned int scale; kstrcpy(&ld, ldName); ksprintf(&bwStr, "%lu", bwidth); scale = (gflags & BGF_LD_IN_VECTORS) ? 0 : vecLen; sprintfFastScalarMad(&madExpr, &bwStr, &ld, scale, NULL); kgenPrintf(ctx, "%s += %s;\n", p, madExpr.buf); } }
int genMulTiles( struct KgenContext *ctx, const BlasGenSettings *gset, const TileMulOpts *mulOpts) { char s[32]; const CLBLASKernExtra *kextra = gset->kextra; const char *tNameIn; unsigned int i; unsigned int iend; bool tra = ((mulOpts->flags & TILEMUL_TRA) != 0); bool trb = ((mulOpts->flags & TILEMUL_TRB) != 0); TileMulCore core; int ret; ret = checkInput(gset, mulOpts); if (ret) { return ret; } getVectorTypeName(kextra->dtype, kextra->vecLen, &tNameIn, NULL); core = checkReplaceCore(gset, mulOpts->core, tra, trb); if (((core == TILEMUL_MULADD || isComplexType(kextra->dtype)) && !tra && trb)) { sprintf(s,"%s sum;\n", tNameIn); kgenAddStmt(ctx, s); } iend = (unsigned int)((mulOpts->flags & TILEMUL_TRA) ? gset->subdims[1].bwidth : gset->subdims[1].y); for (i = 0; i < iend; i++) { genMulLineOnTile(ctx, gset, mulOpts, i, true); } // just to get state ret = kgenAddStmt(ctx, NULL); return (ret) ? -EOVERFLOW : 0; }
/* * Generate cyclical tile shifting so as to convert the skewed * storing to "one-to-one", i. e. the first element in the tile * matches to the first element of the respective tile in the * output matrix. */ static void genTileCyclicalShift(struct KgenContext *ctx, BlasGenSettings *gset) { const char *tname; Kstring k1, k2, *src, *dst, *ktmp; unsigned int row, col; unsigned int seglen; Tile *tileC = &gset->tileCY; seglen = tileLineSegmentLen(tileC); getVectorTypeName(gset->kextra->dtype, seglen, &tname, NULL); kgenAddStmt(ctx, "\n// deliver from skewing in the result\n"); kgenBeginBranch(ctx, "for (uint i = 0; i < skewX; i++)"); kgenPrintf(ctx, "%s tmp;\n\n", tname); src = &k1; dst = &k2; // Skewing may be used only in case of transposed C for (row = 0; row < tileC->nrRows; row += seglen) { sprintfTileElement(dst, tileC, row, tileC->nrCols - 1, seglen); kgenPrintf(ctx, "tmp = %s;\n", dst->buf); for (col = tileC->nrCols - 1; col > 0; col--) { sprintfTileElement(src, tileC, row, col - 1, seglen); kgenPrintf(ctx, "%s = %s;\n", dst->buf, src->buf); // swap pointer ktmp = src; src = dst; dst = ktmp; } kgenPrintf(ctx, "%s = tmp;\n", dst->buf); } kgenEndBranch(ctx, NULL); kgenAddBlankLine(ctx); }
int tileMulGen( struct KgenContext *ctx, const BlasGenSettings *gset, const TileMulOpts *mulOpts) { char s[MAX_LENGTH]; unsigned int vlenA, vlenB; unsigned int i, iend; //counters // size_t m, n, subK; int ret = 0; TileMulFlags mflags = mulOpts->flags; bool tra = ((mflags & TILEMUL_TRA) != 0); bool trb = ((mflags & TILEMUL_TRB) != 0); bool localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY); bool localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY); bool internalFetchB = ((mflags & TILEMUL_NOT_FETCH_B) == 0); bool bwStride = ((mflags & TILEMUL_BW_STRIDE) != 0); bool incK = ((mflags & TILEMUL_NOT_INC_K) == 0); const SubproblemDim *subdims = gset->subdims; size_t bwidth = bwStride ? subdims[0].bwidth : subdims[1].bwidth; TileMulCore core = mulOpts->core; DataType dtype = gset->kextra->dtype; const KernelVarNames *varNames = &gset->varNames; FetchOpts fetchOpts; struct FetchContext *fctx = mulOpts->fctx; FetchAddrMode addrMode; FetchOptLevel foptlev; struct StatementBatch *batch = NULL; const Tile *tile; memset(&fetchOpts, 0, sizeof(fetchOpts)); fetchOpts.memA = mulOpts->memA; fetchOpts.memB = mulOpts->memB; kgenAddStmt(ctx, "/* -- Tiles multiplier -- */\n"); getVecLens(gset, &vlenA, &vlenB, NULL); /* check generator input values */ ret = checkInput(gset, mulOpts); if (ret) { return ret; } if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) { sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)", subdims[0].bwidth, subdims[1].bwidth); kgenBeginBranch(ctx, s); } core = checkReplaceCore(gset, core, tra, trb); if (((core == TILEMUL_MULADD || isComplexType(dtype)) && !tra && trb)) { unsigned int n; const char *tname; n = commonTileSegmentLen(&gset->tileA, &gset->tileBX); getVectorTypeName(gset->tileA.dtype, n, &tname, NULL); sprintf(s,"%s sum;\n", tname); kgenAddStmt(ctx, s); } // FIXME: remove this kludge for backward compatibility if (fctx == NULL) { fctx = createFetchContext(); if (fctx == NULL) { return -ENOMEM; } fetchOpts.mulOpts = mulOpts; } ////////////////////////////////////////////////////// foptlev = getFetchOptLevels(fctx); if ((gset->flags & BGF_WHOLE_A) && internalFetchB && (foptlev & FOPTLEV_MERGE_FETCHES)) { batch = createStmtBatch(); if (batch == NULL) { ret = -ENOMEM; goto out; } } /* * First, disable sharing internal variables of the fetch code for * the first call so as the fetch generator could declares it for the * first matrix. And then re-enable it when invoking the fetch for * the other matrix if it has been actually enabled. */ disableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB); /* * fetch elements of the matrix B, by rows or by columns depending on * the transposing flag */ if (internalFetchB) { tile = &gset->tileBX; fetchOpts.mrole = MATRIX_B; fetchOpts.linesNum = trb ? tile->nrCols : tile->nrRows; if (batch == NULL) { ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts); if (!ret) { ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B); } } else { genFetchInputTileBatch(batch, fctx, gset, &fetchOpts); } } fetchOpts.mrole = MATRIX_A; if (foptlev & FOPTLEV_CAN_SHARE_TMP_AB) { enableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB); } if (ret) { goto out; } if (gset->flags & BGF_WHOLE_A) { tile = &gset->tileA; iend = (tra) ? tile->nrCols : tile->nrRows; fetchOpts.linesNum = iend; if (batch == NULL) { ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts); } else { genFetchInputTileBatch(batch, fctx, gset, &fetchOpts); ret = flushStmtBatch(ctx, batch); if (!ret) { ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B); } } if (!ret) { ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A); } if (ret) { goto out; } // main multiplying loop for (i = 0; i < iend; i++) { if (i) { kgenAddBlankLine(ctx); } genMulLineOnTile(ctx, gset, mulOpts, i, true); } } else { iend = (unsigned int)((tra) ? subdims[1].bwidth : subdims[1].y); fetchOpts.linesNum = 1; // main multiplying loop for (i = 0; i < iend; i++) { if (i) { kgenAddBlankLine(ctx); revalidateFetchContext(fctx, MATRIX_A); } // fetch elements of matrix A from single row fetchOpts.lineOffset = i; genFetchInputTile(ctx, fctx, gset, &fetchOpts); ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A); if (ret) { goto out; } genMulLineOnTile(ctx, gset, mulOpts, i, false); } } /* * increment K-related coordinates or pointers depending on addressing * mode */ addrMode = getFetchAddrMode(fctx); if (addrMode & FETCH_ADDR_K_RELATIVE) { kgenAddBlankLine(ctx); genPointerUpdate(ctx, varNames->A, varNames->lda, bwidth, subdims[0].y, vlenA, dtype, gset->flags, !tra, localA); genPointerUpdate(ctx, varNames->B, varNames->ldb, bwidth, subdims[0].x, vlenB, dtype, gset->flags, trb, localB); } else { if (incK && (varNames->k != NULL) && !(localA && localB)) { sprintf(s, "\n%s += %lu;\n", varNames->k, bwidth); kgenAddStmt(ctx, s); } } if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) { kgenEndBranch(ctx, NULL); // k1 loop } ret = kgenAddStmt(ctx, "/* ---------------------- */\n"); ret = (ret) ? -EOVERFLOW : 0; out: if (batch != NULL) { destroyStmtBatch(batch); } if (fctx != mulOpts->fctx) { destroyFetchContext(fctx); } return ret; }
static void declareLocalVariables( struct KgenContext *ctx, const BlasGenSettings *gset, Tile* parTile, TrsmExtraParams * extraParams) { char tmp[1024]; const SubproblemDim *dims = gset->subdims; const char* parTileTypeName = NULL; bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags, MATRIX_B); unsigned int locWidth; unsigned int tsize; unsigned int parTileSize; unsigned int l1Pans; unsigned int step; kgenAddStmt(ctx, "const int lid = get_local_id(0);\n" "const int gid = get_group_id(0);\n" "GPtr uA, uB;\n" "uint coordA, coordB;\n" "uint m0 = 0, k0, m1;\n"); if (isMatrixUpper(gset->kextra->flags)) { sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n", dims[0].y, dims[0].y); kgenAddStmt(ctx, tmp); } /* * Declare private blocks. * The region 'b' stores in different time tiles of both * the input matrices and the result */ declareTileStorages(ctx, gset); *parTile = gset->tileBX; if (extraParams->ldsUse) { tsize = dtypeSize(gset->kextra->dtype); l1Pans = (unsigned int)(dims[0].x / dims[1].x); parTile->vecLen = (trb) ? (unsigned int)dims[1].x : (unsigned int)dims[1].bwidth; parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize); parTile->trans = trb; /* * Allocate enough space in the local area to fit several tiles * at the stage1 (according to the unrolled factor) and one tile * at the stage2 */ locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor; if (extraParams->ldsUse & LDS_USE_DIAGONAL) { locWidth = umax(locWidth, (unsigned int)dims[1].y); } if (trb) { parTile->nrRows = locWidth; parTile->nrCols = (unsigned int)dims[0].x; step = (unsigned int)dims[1].x / parTile->vecLen; } else { parTile->nrRows = (unsigned int)dims[0].x; parTile->nrCols = locWidth; step = (unsigned int)dims[1].x * locWidth / parTile->vecLen; } parTileSize = tileVectorsNum(parTile); getVectorTypeName(gset->kextra->dtype, parTile->vecLen, &parTileTypeName, NULL); sprintf(tmp, "__local %s tmpB[%i];\n" "LPtr lB;\n" "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n", parTileTypeName, parTileSize, l1Pans, step); kgenAddStmt(ctx, tmp); if (useSkewedFetchB(gset)) { kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n", l1Pans, gset->subdims[1].x); } } kgenAddBlankLine(ctx); }
static void genTileInverting( struct KgenContext *ctx, const BlasGenSettings *gset, const TileSet *tileSet) { char tmp[1024]; const CLBLASKernExtra *kextra = gset->kextra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; const SubproblemDim *dim = &gset->subdims[1]; unsigned int accLen; unsigned int i, j, k; Tile srcTile; Tile dstTile; bool isU, isComplex; bool isInlined = gset->flags & BGF_EXPLICIT_INLINE; const char* typeNameA; const char* typeNameB; memcpy(&srcTile, &tileSet->bAsSqA, sizeof(srcTile)); memcpy(&dstTile, &tileSet->squareA, sizeof(dstTile)); getVectorTypeName(kextra->dtype, dstTile.vecLen, &typeNameA, NULL); getVectorTypeName(kextra->dtype, srcTile.vecLen, &typeNameB, NULL); isU = isMatrixUpper(kflags); isComplex = isComplexType(dtype); if (isComplex || dstTile.trans) { accLen = 1; } else { accLen = umin(srcTile.vecLen, dstTile.vecLen); accLen = umin(accLen, srcTile.nrCols); } if (!isInlined) { dstTile.baseName = "a"; srcTile.baseName = "b"; sprintf(tmp, "void\n" "invertTile(%s *a, %s *b)\n", typeNameA, typeNameB); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); } else { kgenAddStmt(ctx, "// Invert tile\n"); } // made destination block unit genZeroTile(ctx, &dstTile); for (i = 0; i < dim->y; i++) { genSetUnitInTile(ctx, &dstTile, i, i); } kgenAddBlankLine(ctx); for (i = 0; i < dim->y; i++) { Kstring src, srcDiag, dst, dstLast; // current source diagonal element sprintfInvertedElement(&srcDiag, &srcTile, i, i, 1, isU); for (j = i; j < dim->y; j++) { // current source non diagonal element if (i) { sprintfInvertedElement(&src, &srcTile, j, i - 1, 1, isU); } for (k = 0; k < dim->y; k += accLen) { // current updated vectorized element sprintfInvertedElement(&dst, &dstTile, j, k, accLen, isU); // update if (i) { // last updated vectorized element sprintfInvertedElement(&dstLast, &dstTile, i - 1, k, accLen, isU); if (isComplex) { sprintf(tmp, "%s -= mul(%s, %s);\n", dst.buf, dstLast.buf, src.buf); } else { sprintf(tmp, "%s -= %s * %s;\n", dst.buf, dstLast.buf, src.buf); } kgenAddStmt(ctx, tmp); } // divide on the diagonal element if (j == i) { if (isComplex) { sprintf(tmp, "%s = div(%s, %s);\n", dst.buf, dst.buf, srcDiag.buf); } else { sprintf(tmp, "%s /= %s;\n", dst.buf, srcDiag.buf); } kgenAddStmt(ctx, tmp); } } } if (i != dim->y - 1) { kgenAddBlankLine(ctx); } } if (!isInlined) { kgenEndFuncBody(ctx); } kgenAddBlankLine(ctx); }
// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ ((kflags & KEXTRA_COLUMN_MAJOR) != 0); char tmp[2048]; const char *typeName; DataType dtype = kextra->dtype; BlasGenSettings gset, tgset, lset, gset1; CLBLASKernExtra kextraTmp; TileMulOpts mulOpts, tmulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; size_t vecLen = kextra->vecLen; const char *outTypeName; bool b; TilePostFetchPrivate pfPriv; struct symvPrivate priv; size_t wgSize; bool tailM = (kflags & KEXTRA_TAILS_M) != 0; bool tailK = (kflags & KEXTRA_TAILS_K) != 0; bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0; bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A); bool isComplex = isComplexType(dtype); Tile tileb; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; bool isHoriz = subdims[1].bwidth >= subdims[1].y; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; unsigned int cLocal; unsigned int nPlans; wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&pfPriv, 0, sizeof(pfPriv)); memset(&priv, 0, sizeof(priv)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); declareSymvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); /* 1D work space. Matrix is divided among wi, each calculates it's own * part of vector y */ kgenAddStmt(ctx, "#define M actualN\n"); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.flags |= BGF_WHOLE_A | BGF_UPTRS; gset.kextra = kextra; gset.pgran = pgran; initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET); gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); cLocal = wgSize / bStep; nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n", gid, cLocal, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint n = coordA;\n"); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "actualN += startN;\n"); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "uint k = k0;\n"); if (tailK) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, "if (n + Ktail < N)"); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { vnames->sizeM = "N"; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "N"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (rowMaj) { mulOpts.flags |= TILEMUL_BW_STRIDE; } if (upper) { kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n"); } else { kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n"); } sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)", subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); upper ^= rowMaj; tra ^= rowMaj; if (upper ^ rowMaj && tra) { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= !upper; tgset = gset; tmulOpts = mulOpts; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kextraTmp = *kextra; gset1 = gset; kextraTmp.vecLen = 1; gset1.kextra = &kextraTmp; gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1; gset1.tileBX.nrRows = 1; gset1.tileA.nrCols = 1; kextraTmp.vecLenA = 1; } if (isHoriz) { lset = gset; lset.subdims[0].bwidth = lset.subdims[1].bwidth = lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y); lset.tileA.nrCols = lset.tileA.nrRows = lset.tileBX.nrRows = lset.subdims[1].y; kgenAddStmt(ctx, "// the diagonal\n"); kgenBeginBranch(ctx, "if (k <= n)"); kgenAddStmt(ctx, "uint k1 = k;\n"); if (subdims[1].bwidth != subdims[1].y) { kgenAddStmt(ctx, "// the pred diagonal\n"); sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ } initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET, lset.tileA.trans, lset.tileA.packed); declareOneTileStorage(ctx, &tileb); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.tilea = lset.tileA; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; lset.tileA = tileb; mulOpts.postFetch = genPostFetchMirror; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans = lset.tileA.trans ^= true; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; if (subdims[1].bwidth != subdims[1].y) { size_t width = umax(subdims[1].bwidth, subdims[1].y); kgenAddStmt(ctx, "// the post diagonal\n"); if (tailK) { kgenBeginBranch(ctx, "if(k < N)"); } sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)", lset.subdims[1].bwidth, width, width, width, lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (k += %lu; k < actualN; k++)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A; kgenEndBranch(ctx, NULL); } } sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } else { kgenAddStmt(ctx, "// the diagonal\n"); sprintf(tmp, "if (k <= (n + (get_local_id(0)%%%lu)*%lu))", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); kgenBeginBranch(ctx, NULL); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &gset; mulOpts.postFetch = genPostFetchVertDiag; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= true; lset = gset; sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, NULL); priv.diag = false; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* if */ } if (upper) { kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n"); } else { kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n"); } sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (; k < N; k++)"); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A; genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; pfPriv.gset = &tgset; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &tgset, &tmulOpts); if (ret != 0) { return ret; } if (isHoriz) { sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth); } else { sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n", subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* k loop */ kgenBeginBranch(ctx, "if (k < N)"); if (isHoriz) { kgenAddStmt(ctx, "k = n;\n"); } else { sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; if (!isHoriz) { if (upper ^ rowMaj && tra) { tmulOpts.flags &= ~TILEMUL_TRA; } else { tmulOpts.flags |= TILEMUL_TRA; } kgenAddStmt(ctx, "Ktail = N - n;\n"); priv.coord = true; } else { priv.coord = false; } tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K; ret = tileMulGen(ctx, &lset, &tmulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1); /* write back the results */ /* y := alpha*A*x + beta*y */ sprintf(tmp,"(%s - startN)", vnames->coordA); setResultPos(ctx, kflags, tmp); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered; //yes, KEXTRA_TAILS_K because it is set if N % bw != 0 bool tailN = ((kflags & KEXTRA_TAILS_K) != 0); bool tailM = ((kflags & KEXTRA_TAILS_M) != 0); char tmp[4096]; DataType dtype = kextra->dtype; bool doubleBased = isDoubleBasedType(dtype); BlasGenSettings gset; TileMulOpts mulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; TilePostFetchPrivate pfPriv; unsigned int vecLen = kextra->vecLen; const char *outTypeName; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; const char *typeName; size_t wgSize; //unsigned int nStep = 32; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8; unsigned int cLocal; bool isComplex = isComplexType(dtype); unsigned int nPlans; typeName = dtypeBuiltinType(dtype); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations kgenDeclareUptrs(ctx, doubleBased); // now, generate the kernel declareGemvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); kgenAddStmt(ctx, "// M always denotes length of Y " "and N denotes length of X in the kernel\n"); /* 1D work space. Matrix is divided among wi, each calculates it's own * part of vector y */ wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); cLocal = wgSize/bStep; memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.pgran = pgran; gset.kextra = kextra; gset.flags = BGF_UPTRS; initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n", gid, bStep, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < M && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); if (tailN) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "M"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags |= TILEMUL_TRA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth; mulOpts.flags |= TILEMUL_BW_STRIDE; } sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); if (staggered) { vnames->k = "k1"; sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } vnames->k = "k"; kgenEndBranch(ctx, NULL); /* k loop */ if (tailN) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); kgenBeginBranch(ctx, "if (k < N)"); mulOpts.flags |= TILEMUL_SKEW_B; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL; setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { gset.subdims[0].bwidth = subdims[1].bwidth; mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep); /* write back the results */ /* y := alpha*A*x + beta*y */ setResultPos(ctx, kflags, vnames->coordA); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
static int genLoopsK( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, char *tmp) { KernelExtraFlags kflags = gset->kextra->flags; const size_t y0 = gset->subdims[0].y; const size_t bwidth = gset->subdims[1].bwidth; int ret; bool isRel = false; const char *inTypeNameA, *inPtrNameA, *inTypeNameB, *inPtrNameB; getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenA, &inTypeNameA, &inPtrNameA); getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenB, &inTypeNameB, &inPtrNameB); sprintf(tmp, "uint k0;\n"); kgenAddStmt(ctx, tmp); if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER | KEXTRA_TAILS_K_LOWER))) { FetchAddrMode addrMode = FETCH_ADDR_A_RELATIVE | FETCH_ADDR_B_RELATIVE | FETCH_ADDR_K_RELATIVE; isRel = true; mulOpts->fctx = createFetchContext(); if (mulOpts->fctx == NULL) { return -ENOMEM; } setFetchAddrMode(mulOpts->fctx, addrMode); gset->varNames.A = "pA"; gset->varNames.B = "pB"; } else { gset->flags |= BGF_UPTRS; kgenPrintf(ctx, "GPtr Ag, Bg;\n" "\n" "Ag.%s = A;\n" "Bg.%s = B;\n\n", inPtrNameA, inPtrNameB); } if (isMatrixUpper(kflags)) { if (isRel) { switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) | (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^ ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0)) ) { case 0: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 1: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 2: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 3: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; } } sprintf(tmp, "for (k0 = kBegin; " "(k0 <= (kBegin + %luu))&&(k0 < M); " "k0 += %lu)", y0, bwidth); kgenBeginBranch(ctx, tmp); kgenPrintf( ctx, "coord.z = k0;\n"); mulOpts->postFetch = genTrxmPostFetchZero; ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); //main triangle part sprintf(tmp, "for (; k0 <= max(0, (int)M - %lu); k0 += %lu)", y0, gset->subdims[1].bwidth); kgenBeginBranch(ctx, tmp); mulOpts->postFetch = NULL; ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); // matrix side part // should be calculated by item0 of each subgroup sprintf(tmp, "for (; k0 < M; k0 += %lu)", bwidth); kgenBeginBranch(ctx, tmp); kgenPrintf( ctx, "coord.z = k0;\n"); resetFetchNumA(mulOpts); mulOpts->postFetch = genTrxmPostFetchZero; ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); } else { // lower size_t diagBlocks; //Number of bw *y blocks that fit in y*y square if (isRel) { switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) | (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^ ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0)) ) { case 0: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 1: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 2: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 3: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; } } diagBlocks = divRoundUp(y0, bwidth); sprintf(tmp, "uint iterK = min(currM + %luu, M);\n", y0); kgenAddStmt(ctx, tmp); sprintf(tmp, "iterK = (iterK + %lu) / %lu;\n", bwidth - 1, bwidth); kgenAddStmt(ctx, tmp); // main triangle part sprintf(tmp, "for (k0 = 0; k0 < max(0, (int)iterK - %lu); k0++)", diagBlocks); kgenBeginBranch(ctx, tmp); mulOpts->postFetch = NULL; // part without diagonal elements post fetch zeroing ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); // diagonal part sprintf(tmp, "for (; k0 < iterK; k0++)"); kgenBeginBranch(ctx, tmp); kgenPrintf( ctx, "coord.z = k0 * %lu;\n", bwidth); // diagonal blocks part mulOpts->postFetch = genTrxmPostFetchZero; resetFetchNumA(mulOpts); ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); } if (isRel) { destroyFetchContext(mulOpts->fctx); mulOpts->fctx = NULL; } return 0; }