static int subgGetPerf( unsigned int kflags, const void *args ) { DUMMY_ARG_USAGE(args); if( !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_A ) && !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_B ) ){ return PPERF_GOOD; } return PPERF_NOT_SUPPORTED; }
static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { CLBlasKargs *kargs = (CLBlasKargs*)args; KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags; const size_t nChans = 8; // !!!DEVICE DEPENDED!!! const size_t wideChans = 64; // !!!DEVICE DEPENDED!!! const size_t sizeType[] = {1,2,2,4}; size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype]; size_t off = kargs->K % sizeBlock; extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv; if (off == 0 && !isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { /* * FIXME: staggered access is not enabled now since for some reason * it leads to slowdown at small sizes */ extraData->staggered = 0; // wideChans / sizeType[kargs->dtype]; } else { extraData->staggered = 0; } (void)subdims; off = (kargs->offsetM) ? kargs->offsetM : kargs->offsetN; if (off) { if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { kargs->offA += off; } else { kargs->offA += off * kargs->lda.matrix; } if (kargs->ldc.vector < 0) { // K store the original height of the matrix A kargs->offCY += (kargs->K - off) * abs(kargs->ldc.vector); } else { kargs->offCY += off * kargs->ldc.vector; } } kargs->offsetM = kargs->offsetN = 0; }
static void initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) { kvars->A = "imgA"; kvars->B = "imgB"; if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A)) { kvars->coordA = "coordA.x"; } else { kvars->coordA = "coordA.y"; } if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) { kvars->coordB = "coordB.x"; } else { kvars->coordB = "coordB.y"; } kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "K"; }
static bool useSkewedFetchB(const BlasGenSettings *gset) { KernelExtraFlags kflags = gset->kextra->flags; TrsmExtraParams *extraParams = (TrsmExtraParams*)gset->kextra->solverPriv; bool ret = false; if (extraParams->ldsUse & LDS_USE_LARGE) { ret = !isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); } return ret; }
static void genPreloadedTileMul( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, const Tile *parTile, const char* copy2LDSFuncName) { char tmp[1024]; KernelExtraFlags kflags = gset->kextra->flags; unsigned int bwidthOld; const char *oldNameB; const char *ptrName; getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName); kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) { sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n", copy2LDSFuncName, gset->subdims[0].x); } else { sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n", copy2LDSFuncName, gset->subdims[0].x); } kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "lB = lBMain;\n\n"); mulOpts->memB = CLMEM_LOCAL_MEMORY; oldNameB = gset->varNames.B; bwidthOld = (unsigned int)gset->subdims[0].bwidth; gset->varNames.B = "lB"; gset->subdims[0].bwidth = (parTile->trans) ? parTile->nrRows : parTile->nrCols; tileMulGen(ctx, gset, mulOpts); gset->varNames.B = oldNameB; gset->subdims[0].bwidth = bwidthOld; mulOpts->memB = CLMEM_GLOBAL_MEMORY; }
UpdateResultFlags kextraToUpresFlags(BlasFunctionID funcID, KernelExtraFlags kflags) { UpdateResultFlags uf = 0; if (funcHasBeta(funcID) && !(kflags & KEXTRA_BETA_ZERO)) { uf |= UPRES_WITH_BETA; } if (isMatrixAccessColMaj(funcID, kflags, MATRIX_C)) { uf |= UPRES_COLUMN_MAJOR; } if (kflags & KEXTRA_NO_COPY_VEC_C) { uf |= UPRES_NO_VECTORIZATION; } return uf; }
/* * Checks current dimensionality on a validity */ bool VISIBILITY_HIDDEN isSubDimValid(SubDimInfo* sd) { int j; size_t wgX = sd->pgran.wgSize[0]; size_t wgY = sd->pgran.wgSize[1]; SubproblemDim l0 = sd->sdim[0]; SubproblemDim l1 = sd->sdim[1]; size_t dataTypeSize = getDataTypeSize(sd->dtype); size_t dataFloatSize = getDataTypeSize(TYPE_FLOAT); int maxRegistr = 64; bool ret = true; bool inv; IgnoreItem* ii = sd->first; // if pattern-based validation is available if( NULL != sd->pattern->sops->checkCalcDecomp ){ return sd->pattern->sops->checkCalcDecomp( &sd->pgran, sd->sdim, 2, sd->dtype, PGRAN_CHECK ); } ret = ret && (l1.y >= 4*dataFloatSize/dataTypeSize); if (sd->blasLevel == 3) { if (!isMatrixAccessColMaj(sd->func, sd->flag, MATRIX_A) || !isMatrixAccessColMaj(sd->func, sd->flag, MATRIX_B)) { /* Avoid small bwidth and big x0, y0 for cases other than * column major access to both matrixes */ ret = ret && (l1.bwidth >= 4*dataFloatSize/dataTypeSize); ret = ret && (l0.y < 128); ret = ret && (l0.x < 128); } } if ( 0 == l1.bwidth ){ return false; } else{ ret = ret && ((l0.bwidth % l1.bwidth) == 0); ret = ret && (wgX*wgY == 64); } //ret = ret && (wgX*wgY < sd->workGroupSizes); //ret = ret && (wgX*wgY > 16); if (sd->blasLevel == 2) { ret = ret && (l0.y > l1.y); } else { ret = ret && (l0.x > l1.x); ret = ret && (l0.y > l1.y); ret = ret && (l1.x >= 4*dataFloatSize/dataTypeSize); } if (sd->is2D) { bool r = ret; ret = ret && (wgY * l1.itemX == l0.x); ret = ret && (wgX * l1.itemY == l0.y); if (r != ret) { return ret; } } if (ret && sd->isSquareBlock) { ret = ret && (l0.x == l0.y && l0.x == l0.bwidth); } //if (!(isLdsUsed(sd->pattern) || (sd->isSquareBlock && sd->nrLevel == 2))) { // ret = ret && l0.bwidth == l1.bwidth; //} if (ret) { int r ; r = (int)(l1.x*l1.bwidth + l1.y*l1.bwidth + l1.x*l1.y); r = r * (int)dataTypeSize / sizeof(cl_float4); if (r > maxRegistr) { return false; } } if (ret && sd->pattern->sops->isFitToLDS != NULL) { bool isFitToLDS; CLBlasKargs args; convKExtraFlagToArg(sd->flag, &args); isFitToLDS = sd->pattern->sops->isFitToLDS(sd->sdim, sd->dtype, sd->ldsSize, &args); if (!isFitToLDS) return false; } // Skip ignored dimension for (;ii != NULL; ii = ii->next) { inv = true; for(j = 0; j < V_COUNT; ++j) { int v1 = ii->var[j]; int v2 = get(&sd->var[j]); if (v1 == -1) { continue; } if (v1 == v2) { continue; } inv = false; break; } if (inv) { ret = false; } } return ret; }
void initDefaultTiles( BlasGenSettings *gset, BlasFunctionID funcID, TileCreationFlags flags, PrivateStorageType storType) { const SubproblemDim *dim = &gset->subdims[1]; KernelExtraFlags kflags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; Tile *tile; const char *name; int level; bool packed; level = funcBlasLevel(funcID); packed = ((flags & TILE_PACKED) != 0); tile = &gset->tileA; selectTileBaseName(tile, "a"); initTile(tile, tile->baseName, (unsigned int)dim->y, (unsigned int)dim->bwidth, 1, dtype, storType, false, packed); tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_A); if (!(gset->flags & BGF_WHOLE_A)) { if (tile->trans) { tile->nrCols = 1; } else { tile->nrRows = 1; } } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_A); tile = &gset->tileBX; name = (level == 2) ? "x" : "b"; selectTileBaseName(tile, name); initTile(tile, tile->baseName, (unsigned int)dim->bwidth, (unsigned int)dim->x, 1, dtype, storType, false, packed); /* * NOTE: Tiles for the level 2 functions are forced to be transposed * in order to allow user to fetch elements belonging to different * rows which is very useful in case of unit increment between * elements because provides faster access to the global memory. */ if (level == 2) { tile->trans = true; } else { tile->trans = !isMatrixAccessColMaj(funcID, kflags, MATRIX_B); } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_B); tile = &gset->tileCY; name = (level == 2) ? "y" : "c"; selectTileBaseName(tile, name); initTile(tile, tile->baseName, (unsigned int)dim->y, (unsigned int)dim->x, 1, dtype, storType, false, packed); if (level == 2) { tile->trans = true; } else if (!(flags & TILE_C_FORCE_NOTRANS)) { tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_C); } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_C); // FIXME: remove the restriction /*if (isComplexType(tile->dtype)) { tile->vecLen = 1; }*/ }
static void initTiles( BlasGenSettings* gset, TileSet* tileSet, const struct SubproblemDim *subdims, KernelExtraFlags kflags, DataType dtype, PrivateStorageType storType) { unsigned int rowsA; unsigned int rowsB; unsigned int rowsC; unsigned int colsA; unsigned int colsB; unsigned int colsC; bool transA; bool transB; unsigned int vecLenA; unsigned int vecLenB; unsigned int vecLenC; rowsA = (unsigned int)subdims[1].y; colsA = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth); rowsB = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth); colsB = (unsigned int)szmax(subdims[1].x, subdims[1].y); rowsC = (unsigned int)subdims[1].y; colsC = (unsigned int)subdims[1].x; transA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); transB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); vecLenA = (unsigned int)((transA) ? subdims[1].y : subdims[1].bwidth); vecLenA = umin(vecLenA, MAX_TILE_VECLEN); vecLenB = (unsigned int)((transB) ? subdims[1].x : subdims[1].bwidth); vecLenB = umin(vecLenB, MAX_TILE_VECLEN); vecLenC = (transB) ? vecLenB : vecLenA; initTile(&tileSet->rectA, "a", (unsigned int)subdims[1].y, (unsigned int)subdims[1].bwidth, vecLenA, dtype, storType, transA, false); initTile(&tileSet->squareA, "a", (unsigned int)subdims[1].y, (unsigned int)subdims[1].y, vecLenA, dtype, storType, transA, false); initTile(&tileSet->origB, "b", (unsigned int)subdims[1].bwidth, (unsigned int)subdims[1].x, vecLenB, dtype, storType, !transB, false); initTile(&tileSet->bStage2, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].x, vecLenB, dtype, storType, !transB, false); initTile(&tileSet->bAsSqA, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].y, vecLenB, dtype, storType, transA, false); initTile(&tileSet->bAsC, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].x, vecLenB, dtype, storType, gset->tileCY.trans, false); initTile(&gset->tileA, "a", rowsA, colsA, vecLenA, dtype, storType, transA, false); initTile(&gset->tileBX, "b", rowsB, colsB, vecLenB, dtype, storType, !transB, false); initTile(&gset->tileCY, "c", rowsC, colsC, vecLenC, dtype, storType, !transB, false); tileSet->A = gset->tileA; tileSet->B = gset->tileBX; }
static void declareLocalVariables( struct KgenContext *ctx, const BlasGenSettings *gset, Tile* parTile, TrsmExtraParams * extraParams) { char tmp[1024]; const SubproblemDim *dims = gset->subdims; const char* parTileTypeName = NULL; bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags, MATRIX_B); unsigned int locWidth; unsigned int tsize; unsigned int parTileSize; unsigned int l1Pans; unsigned int step; kgenAddStmt(ctx, "const int lid = get_local_id(0);\n" "const int gid = get_group_id(0);\n" "GPtr uA, uB;\n" "uint coordA, coordB;\n" "uint m0 = 0, k0, m1;\n"); if (isMatrixUpper(gset->kextra->flags)) { sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n", dims[0].y, dims[0].y); kgenAddStmt(ctx, tmp); } /* * Declare private blocks. * The region 'b' stores in different time tiles of both * the input matrices and the result */ declareTileStorages(ctx, gset); *parTile = gset->tileBX; if (extraParams->ldsUse) { tsize = dtypeSize(gset->kextra->dtype); l1Pans = (unsigned int)(dims[0].x / dims[1].x); parTile->vecLen = (trb) ? (unsigned int)dims[1].x : (unsigned int)dims[1].bwidth; parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize); parTile->trans = trb; /* * Allocate enough space in the local area to fit several tiles * at the stage1 (according to the unrolled factor) and one tile * at the stage2 */ locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor; if (extraParams->ldsUse & LDS_USE_DIAGONAL) { locWidth = umax(locWidth, (unsigned int)dims[1].y); } if (trb) { parTile->nrRows = locWidth; parTile->nrCols = (unsigned int)dims[0].x; step = (unsigned int)dims[1].x / parTile->vecLen; } else { parTile->nrRows = (unsigned int)dims[0].x; parTile->nrCols = locWidth; step = (unsigned int)dims[1].x * locWidth / parTile->vecLen; } parTileSize = tileVectorsNum(parTile); getVectorTypeName(gset->kextra->dtype, parTile->vecLen, &parTileTypeName, NULL); sprintf(tmp, "__local %s tmpB[%i];\n" "LPtr lB;\n" "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n", parTileTypeName, parTileSize, l1Pans, step); kgenAddStmt(ctx, tmp); if (useSkewedFetchB(gset)) { kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n", l1Pans, gset->subdims[1].x); } } kgenAddBlankLine(ctx); }
static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tmp[4096]; struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; bool doubleBased = isDoubleBasedType(dtype); size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered; int ret; BlasGenSettings gset; TileMulOpts mulOpts; int tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A); int trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B); unsigned int l1Pans; TilePostFetchPrivate pfPriv[2]; UpdateResultFlags upResFlags; TailStatus tailStatus; bool subgMode = false; SubgVarNames subgVNames; ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // mismatching subdims define case with subgroup decomposition subgMode = ( subdims[0].bwidth != subdims[1].bwidth ); memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.flags = BGF_DISTINCT_VECLEN; gset.flags |= BGF_WHOLE_A; /*FIXME: This used to be a workaround for compilation issues with dtrmm on * cpu. Normally BGF_WHOLE_A should be enabled always. But for now, * there are wrong results for non-aligned cases on CPU and there is * no workaround yet. if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K)) { gset.flags &= ~BGF_WHOLE_A; }*/ gset.kextra = kextra; gset.pgran = pgran; //avoid [0].bw loop //gset.subdims[0].bwidth = gset.subdims[1].bwidth; memset(pfPriv, 0, sizeof(pfPriv)); pfPriv[0].funcID = CLBLAS_TRMM; pfPriv[0].gset = &gset; if ((gset.flags & BGF_WHOLE_A) != 0) { pfPriv[0].wholeA = 1; } // at first, generate needed declarations kgenDeclareUptrs(ctx, doubleBased); // For inner callback, because both callbacks use own fetchNumA memcpy(&pfPriv[1], &pfPriv[0], sizeof(pfPriv[0])); // if both matrices are accessed row-major - using subgroup pattern if ( subgMode ) { declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Subgroup", true, true); gset.flags |= BGF_UPTRS; } else { declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Block", true, true); } kgenBeginFuncBody(ctx); initDefaultTiles(&gset, CLBLAS_TRMM, 0, PRIV_STORAGE_VARIABLE_SET); declareTileStorages(ctx, &gset); kgenAddStmt(ctx, "uint currM, currN;\n" "uint4 coord = 0; /* contains coordB, coordA, k */\n"); kgenDeclareLocalID(ctx, "lid", pgran); kgenDeclareGroupID(ctx, "gid", pgran); if ( subgMode ) { gset.varNames.LDS = "scratch"; // declaring variables used by subgroup mode subgVNames.itemId = "itemId"; subgVNames.subgCoord = "subgCoord"; kgenAddBlankLine( ctx ); kgenAddBlankLine(ctx); kgenPrintf(ctx, "int2 %s;\n", subgVNames.itemId ); kgenPrintf(ctx, "int2 %s;\n", subgVNames.subgCoord); // item ID kgenPrintf( ctx, "%s.x = get_local_id(0)%%%d;\n", subgVNames.itemId, subdims[0].bwidth/subdims[1].bwidth); // subgroup ID kgenPrintf( ctx, "%s.y = get_local_id(0)/%d;\n", subgVNames.itemId, subdims[0].bwidth/subdims[1].bwidth); // subgroup coordX kgenPrintf( ctx, "%s.x = %s.y/%d;\n", subgVNames.subgCoord, subgVNames.itemId, subdims[0].y/subdims[1].y ); // subgroup coordY kgenPrintf( ctx, "%s.y = %s.y%%%d;\n", subgVNames.subgCoord, subgVNames.itemId, subdims[0].y/subdims[1].y ); } kgenAddBlankLine(ctx); sprintf(tmp, "currN = gid * %lu;\n", subdims->x); kgenAddStmt(ctx, tmp); genInitCurrM(ctx, subdims, kflags); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, true); if ( subgMode ) { kgenAddStmt(ctx, "GPtr Ag = {A};\n" "GPtr Bg = {B};\n"); } l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; memset(&mulOpts, 0, sizeof(mulOpts)); mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? TILEMUL_MAD : TILEMUL_MULADD; mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = &pfPriv; mulOpts.flags = TILEMUL_NO_FLAGS; mulOpts.flags |= TILEMUL_EXTERN_RDECL; if ( subgMode ) { mulOpts.flags |= TILEMUL_NOT_INC_K; mulOpts.flags |= TILEMUL_BW_STRIDE; } if (kflags & KEXTRA_TAILS_M_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } if (kflags & KEXTRA_TAILS_N_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B; } if (kflags & KEXTRA_TAILS_K_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K; mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL; } if (tra) { mulOpts.flags |= TILEMUL_TRA; } if (!trb) { mulOpts.flags |= TILEMUL_TRB; } if (isMatrixConj(kflags, MATRIX_A)) { mulOpts.flags |= TILEMUL_CONJA; } if (isMatrixConj(kflags, MATRIX_B)) { mulOpts.flags |= TILEMUL_CONJB; } initKernelVarNames(&gset.varNames); if ( subgMode ) { kgenPrintf( ctx, "coord.x = currN + %s.x*%d;\n", subgVNames.subgCoord, subdims[1].x ); } else { sprintf(tmp, "coord.x = currN + lid %% %u * %lu;\n", l1Pans, subdims[1].x); kgenAddStmt(ctx, tmp); } // loop over M sprintf(tmp, "for (uint m0 = 0; m0 < M; m0 += %lu)", subdims[0].y); kgenBeginBranch(ctx, tmp); genStartPosK( ctx, subdims, kflags, subgMode ); sprintf(tmp, "coord.z = kBegin;\n"); kgenAddStmt(ctx, tmp); if ( subgMode ) { kgenPrintf(ctx, "coord.y = currM + %s.y*%d;\n", subgVNames.subgCoord, subdims[1].y); } else { sprintf( tmp, "coord.y = currM + lid / %u * %lu;\n", l1Pans, subdims[1].y ); kgenAddStmt(ctx, tmp); } genZeroTile(ctx, &gset.tileCY); checkGenBeginHitMatrixBlock(ctx, kflags); tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRMM, &gset, NULL); // loops along 'K' if ( subgMode ) { ret = genSubgLoopsK( ctx, &gset, &mulOpts, &subgVNames, staggered); } else { ret = genLoopsK( ctx, &gset, &mulOpts, tmp ); } if (ret != 0) { printf("%s", buf); return ret; } checkGenEndHitMatrixBlock(ctx, kflags); kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); // store results // for result update - x coordinate is in elements, not in vectors checkGenRestoreTailCoords(ctx, &gset, tailStatus); upResFlags = kextraToUpresFlags(CLBLAS_TRMM, kflags); upResFlags |= tailStatusToUpresFlags(tailStatus); upResFlags |= UPRES_INDEXING_WITH_CONSTANTS; upResFlags |= UPRES_TRIANG_WRITE_C; upResFlags |= UPRES_EXCEED_PROBLEM_CONDITION; if ( subgMode ) { mergeUpdateResult( ctx, CLBLAS_TRMM, &gset, &subgVNames, upResFlags, genResultUpdateWithFlags ); } else { //checkGenBeginHitMatrixBlock(ctx, kflags); genResultUpdateWithFlags( ctx, CLBLAS_TRMM, &gset, upResFlags, NULL, NULL, NULL ); //checkGenEndHitMatrixBlock(ctx, kflags); } if (isMatrixUpper(kflags)) { sprintf(tmp, "currM += %lu;\n", subdims[0].y); } else { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; char tmp[4096], tmp1[4096]; char *p; // is the iteration over N, N at the top level const char *typeName; char fpref; DataType dtype = kextra->dtype; ssize_t ret; BlasGenSettings gset; BlkMulOpts mulOpts; unsigned int tsize; unsigned int vecLen, outVecLen; bool b; const char *outTypeName; unsigned int i; unsigned int nrRegs, regPitch; int tra, trb; char vect[2] = {'y', 'x'}; const char *coordConstants = "const uint workItemM = get_global_id(0) * %lu;\n" "const uint workItemN = get_global_id(1) * %lu;\n" "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n" "uint vectK = (K + %u) / %u;\n"; /* * template for image based gemm preparation part * for two dimensional work space */ const char *localVariables = "uint k0;\n" "int2 coordA = (int2)(0, workItemM);\n" "int2 coordB = (int2)(0, workItemN);\n" "%s c[%u];\n\n"; tsize = dtypeSize(dtype); vecLen = sizeof(cl_float4) / dtypeSize(dtype); if (isComplexType(dtype)) { regPitch = (unsigned int)subdims[1].x; } else { regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) * sizeof(cl_float4) / tsize; } memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; initKernelVarNames(&gset.varNames, kextra->flags); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations and auxiliary functions b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); // now, generate the kernel sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref, typeName, typeName, typeName); kgenDeclareFunction(ctx, tmp); ret = kgenBeginFuncBody(ctx); // constants sprintf(tmp, coordConstants, subdims[1].y, subdims[1].x, subdims[1].y, vecLen - 1, vecLen); kgenAddStmt(ctx, tmp); /* * Calculate local buffer pitches, and then declare local * variables */ getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); sprintf(tmp, localVariables, outTypeName, nrRegs); kgenAddStmt(ctx, tmp); // check if offset exceeds matrix kgenAddStmt(ctx, "if ((workItemM >= M) ||" "(workItemN >= N)) {\n" " return;\n" "}\n"); kgenAddStmt(ctx, "C += offsetC;\n"); // zero C block sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n" " c[k0] = 0;\n" "}\n\n", nrRegs); kgenAddStmt(ctx, tmp); // block multiplication inlined function sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)", subdims[1].bwidth / vecLen); kgenBeginBranch(ctx, tmp); mulOpts.aMobj = CLMEM_IMAGE; mulOpts.bMobj = CLMEM_IMAGE; mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE; if (isComplexType(dtype)) { mulOpts.core = BLKMUL_SEPARATE_MULADD; } else { mulOpts.core = BLKMUL_MAD; } mulOpts.argNames.coordA = "coordA"; mulOpts.argNames.coordB = "coordB"; mulOpts.argNames.skewCol = "skewCol"; mulOpts.argNames.skewRow = "skewRow"; mulOpts.argNames.k = "k0"; mulOpts.argNames.vectBoundK = "vectK"; ret = blkMulGen(ctx, subdims, dtype, &mulOpts); if (ret) { destroyKgenContext(ctx); return -EOVERFLOW; } // update image coordinates sprintf(tmp, "\ncoordA.x += %lu;\n" "coordB.x += %lu;\n", subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); // reorder the given solution outVecLen = isComplexType(dtype) ? 1 : vecLen; p = tmp1; for (i = 0; i < regPitch / outVecLen; i++) { unsigned int k = (unsigned int)(subdims[1].y - 1) * regPitch / outVecLen + i; sprintf(p, "\n" " tmp = c[%u];\n" " for (j = %lu; j >= 0; j--) {\n" " c[(j+1) * %u + %u] = c[j * %u + %u];\n" " }\n" " c[%u] = tmp;\n", k, subdims[1].y - 2, regPitch / outVecLen, i, regPitch / outVecLen, i, i); p += strlen(p); } sprintf(tmp, "\n" "for (k0 = 0; k0 < skewRow.y; k0++) {\n" " int j;\n" " %s tmp;\n" "%s" "}\n" "\n", outTypeName, tmp1); kgenAddStmt(ctx, tmp); tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B); sprintf(tmp, "coordA.%c = workItemM;\n" "coordB.%c = workItemN;\n\n", vect[tra], vect[trb]); kgenAddStmt(ctx, tmp); // write back the tile evaluated generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ ((kflags & KEXTRA_COLUMN_MAJOR) != 0); char tmp[2048]; const char *typeName; DataType dtype = kextra->dtype; BlasGenSettings gset, tgset, lset, gset1; CLBLASKernExtra kextraTmp; TileMulOpts mulOpts, tmulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; size_t vecLen = kextra->vecLen; const char *outTypeName; bool b; TilePostFetchPrivate pfPriv; struct symvPrivate priv; size_t wgSize; bool tailM = (kflags & KEXTRA_TAILS_M) != 0; bool tailK = (kflags & KEXTRA_TAILS_K) != 0; bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0; bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A); bool isComplex = isComplexType(dtype); Tile tileb; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; bool isHoriz = subdims[1].bwidth >= subdims[1].y; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; unsigned int cLocal; unsigned int nPlans; wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&pfPriv, 0, sizeof(pfPriv)); memset(&priv, 0, sizeof(priv)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); declareSymvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); /* 1D work space. Matrix is divided among wi, each calculates it's own * part of vector y */ kgenAddStmt(ctx, "#define M actualN\n"); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.flags |= BGF_WHOLE_A | BGF_UPTRS; gset.kextra = kextra; gset.pgran = pgran; initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET); gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); cLocal = wgSize / bStep; nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n", gid, cLocal, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint n = coordA;\n"); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "actualN += startN;\n"); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "uint k = k0;\n"); if (tailK) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, "if (n + Ktail < N)"); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { vnames->sizeM = "N"; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "N"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (rowMaj) { mulOpts.flags |= TILEMUL_BW_STRIDE; } if (upper) { kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n"); } else { kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n"); } sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)", subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); upper ^= rowMaj; tra ^= rowMaj; if (upper ^ rowMaj && tra) { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= !upper; tgset = gset; tmulOpts = mulOpts; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kextraTmp = *kextra; gset1 = gset; kextraTmp.vecLen = 1; gset1.kextra = &kextraTmp; gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1; gset1.tileBX.nrRows = 1; gset1.tileA.nrCols = 1; kextraTmp.vecLenA = 1; } if (isHoriz) { lset = gset; lset.subdims[0].bwidth = lset.subdims[1].bwidth = lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y); lset.tileA.nrCols = lset.tileA.nrRows = lset.tileBX.nrRows = lset.subdims[1].y; kgenAddStmt(ctx, "// the diagonal\n"); kgenBeginBranch(ctx, "if (k <= n)"); kgenAddStmt(ctx, "uint k1 = k;\n"); if (subdims[1].bwidth != subdims[1].y) { kgenAddStmt(ctx, "// the pred diagonal\n"); sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ } initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET, lset.tileA.trans, lset.tileA.packed); declareOneTileStorage(ctx, &tileb); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.tilea = lset.tileA; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; lset.tileA = tileb; mulOpts.postFetch = genPostFetchMirror; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans = lset.tileA.trans ^= true; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; if (subdims[1].bwidth != subdims[1].y) { size_t width = umax(subdims[1].bwidth, subdims[1].y); kgenAddStmt(ctx, "// the post diagonal\n"); if (tailK) { kgenBeginBranch(ctx, "if(k < N)"); } sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)", lset.subdims[1].bwidth, width, width, width, lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (k += %lu; k < actualN; k++)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A; kgenEndBranch(ctx, NULL); } } sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } else { kgenAddStmt(ctx, "// the diagonal\n"); sprintf(tmp, "if (k <= (n + (get_local_id(0)%%%lu)*%lu))", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); kgenBeginBranch(ctx, NULL); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &gset; mulOpts.postFetch = genPostFetchVertDiag; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= true; lset = gset; sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, NULL); priv.diag = false; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* if */ } if (upper) { kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n"); } else { kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n"); } sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (; k < N; k++)"); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A; genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; pfPriv.gset = &tgset; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &tgset, &tmulOpts); if (ret != 0) { return ret; } if (isHoriz) { sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth); } else { sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n", subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* k loop */ kgenBeginBranch(ctx, "if (k < N)"); if (isHoriz) { kgenAddStmt(ctx, "k = n;\n"); } else { sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; if (!isHoriz) { if (upper ^ rowMaj && tra) { tmulOpts.flags &= ~TILEMUL_TRA; } else { tmulOpts.flags |= TILEMUL_TRA; } kgenAddStmt(ctx, "Ktail = N - n;\n"); priv.coord = true; } else { priv.coord = false; } tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K; ret = tileMulGen(ctx, &lset, &tmulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1); /* write back the results */ /* y := alpha*A*x + beta*y */ sprintf(tmp,"(%s - startN)", vnames->coordA); setResultPos(ctx, kflags, tmp); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
int generateImageCopyFuncs( CopyImgFuncs *copyFuncs, struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset) { const SubproblemDim *dims = gset->subdims; KernelExtraFlags kflags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; const PGranularity *pgran = gset->pgran; CopyPattern pattern; // mandatory flags for global to local copying DBlockCopyFlags glcpFlags[2] = {0, 0}; struct KgenGuard *guard; unsigned int tsize; int ret = 0; bool isTra, areTails, isConjA; bool customize; if (kflags & KEXTRA_NO_COPY_VEC_A) { glcpFlags[0] = DBLOCK_COPY_NOT_VECTORIZE; } if (kflags & KEXTRA_NO_COPY_VEC_B) { glcpFlags[1] = DBLOCK_COPY_NOT_VECTORIZE; } tsize = dtypeSize(dtype); isTra = isMatrixAccessColMaj(funcID, kflags, MATRIX_A); isConjA = isMatrixConj(kflags, MATRIX_A); areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); customize = (funcID == CLBLAS_TRMM); guard = createKgenGuard(ctx, cpyImgGenCallback, sizeof(CopyPattern)); if (guard == NULL) { return -ENOMEM; } memset(&pattern, 0, sizeof(pattern)); pattern.zeroing = false; pattern.dim = dims[0]; pattern.dir = DBLOCK_GLOBAL_TO_IMAGE; pattern.dtype = dtype; pattern.flags = 0; pattern.generic = false; pattern.pgran = pgran; if (!(customize && (isTra || isConjA))) { pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[0].y; findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[0].x; findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].y; pattern.dir = DBLOCK_LOCAL_TO_IMAGE; findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].x; pattern.dir = DBLOCK_LOCAL_TO_IMAGE; findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); // Global to local optimized pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; if (customize || isComplexType(dtype)) { pattern.flags = (!customize || isConjA) ? DBLOCK_COPY_CONJUGATE : 0; pattern.flags |= glcpFlags[0]; pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].y; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } if ((funcID == CLBLAS_GEMM) && isComplexType(dtype)) { pattern.flags = DBLOCK_COPY_CONJUGATE | glcpFlags[1]; pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].x; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } // Global to local generic pattern.dim = dims[0]; pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; pattern.generic = true; if (!customize || areTails) { pattern.flags = (isConjA) ? DBLOCK_COPY_CONJUGATE : 0; pattern.flags |= glcpFlags[0]; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalGeneric[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } pattern.flags = (kflags & KEXTRA_CONJUGATE_B) ? DBLOCK_COPY_CONJUGATE : 0; pattern.flags |= glcpFlags[1]; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalGeneric[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); // Global to local transposed functions pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; pattern.flags = (kflags & KEXTRA_NO_COPY_VEC_A) ? DBLOCK_COPY_NOT_VECTORIZE : 0; pattern.flags |= glcpFlags[0]; if (!customize || isTra) { pattern.generic = false; if (isConjA) { pattern.flags |= DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE; } else { pattern.flags |= DBLOCK_COPY_TRANSPOSE; } pattern.dim.x = dims[1].y; pattern.dim.y = dims[0].bwidth; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposed[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } if (!customize || (isTra && areTails)) { pattern.generic = true; pattern.dim.x = 0; pattern.dim.y = 0; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposedGeneric[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } pattern.generic = false; pattern.dim.x = dims[1].x; pattern.dim.y = dims[0].bwidth; if (kflags & KEXTRA_CONJUGATE_B) { pattern.flags = DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE; } else { pattern.flags = DBLOCK_COPY_TRANSPOSE; } pattern.flags |= glcpFlags[1]; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposed[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.generic = true; pattern.dim.x = 0; pattern.dim.y = 0; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposedGeneric[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); // generate two local zeroing functions for matrix A and matrix B blocks pattern.zeroing = true; pattern.dim = dims[0]; pattern.generic = false; pattern.flags = 0; pattern.dim.y = 1; pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].y; findGenerateFunction(guard, &pattern, copyFuncs->zeroBlock[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].x; findGenerateFunction(guard, &pattern, copyFuncs->zeroBlock[MATRIX_B], FUNC_NAME_MAXLEN); ret = kgenAddBlankLine(ctx); destroyKgenGuard(guard); return ret; }
// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered; //yes, KEXTRA_TAILS_K because it is set if N % bw != 0 bool tailN = ((kflags & KEXTRA_TAILS_K) != 0); bool tailM = ((kflags & KEXTRA_TAILS_M) != 0); char tmp[4096]; DataType dtype = kextra->dtype; bool doubleBased = isDoubleBasedType(dtype); BlasGenSettings gset; TileMulOpts mulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; TilePostFetchPrivate pfPriv; unsigned int vecLen = kextra->vecLen; const char *outTypeName; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; const char *typeName; size_t wgSize; //unsigned int nStep = 32; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8; unsigned int cLocal; bool isComplex = isComplexType(dtype); unsigned int nPlans; typeName = dtypeBuiltinType(dtype); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations kgenDeclareUptrs(ctx, doubleBased); // now, generate the kernel declareGemvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); kgenAddStmt(ctx, "// M always denotes length of Y " "and N denotes length of X in the kernel\n"); /* 1D work space. Matrix is divided among wi, each calculates it's own * part of vector y */ wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); cLocal = wgSize/bStep; memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.pgran = pgran; gset.kextra = kextra; gset.flags = BGF_UPTRS; initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n", gid, bStep, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < M && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); if (tailN) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "M"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags |= TILEMUL_TRA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth; mulOpts.flags |= TILEMUL_BW_STRIDE; } sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); if (staggered) { vnames->k = "k1"; sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } vnames->k = "k"; kgenEndBranch(ctx, NULL); /* k loop */ if (tailN) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); kgenBeginBranch(ctx, "if (k < N)"); mulOpts.flags |= TILEMUL_SKEW_B; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL; setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { gset.subdims[0].bwidth = subdims[1].bwidth; mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep); /* write back the results */ /* y := alpha*A*x + beta*y */ setResultPos(ctx, kflags, vnames->coordA); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
static void initCopyPattern( CopyPattern *pattern, const SubproblemDim *blasDim, KernelExtraFlags flags, MatrixRole mrole, BlasFunctionID funcID) { SubproblemDim *dim = &pattern->dim; unsigned int vecFlag = 0; pattern->flags = 0; if (blasDim == NULL) { pattern->generic = true; dim->x = 0; dim->y = 0; } else { pattern->generic = false; switch (mrole) { case MATRIX_A: dim->x = blasDim->bwidth; dim->y = blasDim->y; break; case MATRIX_B: dim->x = blasDim->bwidth; dim->y = blasDim->x; break; case MATRIX_C: dim->x = blasDim->x; dim->y = blasDim->y; break; default: break; } } switch (mrole) { case MATRIX_A: vecFlag = KEXTRA_NO_COPY_VEC_A; break; case MATRIX_B: vecFlag = KEXTRA_NO_COPY_VEC_B; break; case MATRIX_C: if ((funcID == CLBLAS_TRMM) || (funcID == CLBLAS_TRSM)) { vecFlag = KEXTRA_NO_COPY_VEC_B; } else { vecFlag = KEXTRA_NO_COPY_VEC_C; } break; default: break; } if (flags & vecFlag) { pattern->flags |= DBLOCK_COPY_NOT_VECTORIZE; } if (isMatrixAccessColMaj(funcID, flags, mrole)) { if ((pattern->dir == DBLOCK_GLOBAL_TO_LOCAL) && !pattern->generic) { dimSwapXY(dim); } pattern->flags |= DBLOCK_COPY_TRANSPOSE; } if (isMatrixConj(flags, mrole)) { pattern->flags |= DBLOCK_COPY_CONJUGATE; } }
static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tmp[1024]; struct KgenContext *ctx; ssize_t ret; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; DataType dtype = kextra->dtype; KernelExtraFlags kflags = kextra->flags; CLBLASKernExtra extraNew; BlasGenSettings gset; TileMulOpts mulOpts; const char *ptrName; UpdateResultFlags upFlags = 0; TilePostFetchPrivate pfPriv; unsigned int l1Pans; bool b; Tile parTile; TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv; int ldsLarge, lds_diagonal; bool isInline; TileSet tileSet; char copy2LDSFuncName[FUNC_NAME_MAXLEN]; TailStatus tailStatus = 0; FetchAddrMode addrMode = 0; bool tailM = ((kflags & KEXTRA_TAILS_M) != 0); bool tailN = ((kflags & KEXTRA_TAILS_N) != 0); size_t alignK; if (pgran->wgDim != 1) { return -EINVAL; } l1Pans = (unsigned int)(subdims[0].x / subdims[1].x); memset(&gset, 0, sizeof(gset)); gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS; memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2); // there is not need in block structure along K gset.subdims[0].bwidth = gset.subdims[1].bwidth; subdims = gset.subdims; /* * Since tiles are changed dynamically, e. g. in the main tilemul * loop they are rectangular, but at the second stage both A and B * tile storages are used for square tiles. One must adjust physical * vectorization accordindly, so as vector length might not be * greater than linear size of any tile */ memcpy(&extraNew, kextra, sizeof(extraNew)); extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y); extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y); gset.pgran = pgran; gset.kextra = &extraNew; initKernelVarNames(&gset.varNames); // multiplication options mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD : TILEMUL_MULADD; mulOpts.postFetch = NULL; mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags); mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K; mulOpts.fctx = createFetchContext(); if (mulOpts.fctx == NULL) { return -ENOMEM; } disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING); isInline = (gset.flags & BGF_EXPLICIT_INLINE); initTiles(&gset, &tileSet, subdims, kflags, dtype, PRIV_STORAGE_VARIABLE_SET); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { destroyFetchContext(mulOpts.fctx); return -ENOMEM; } kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n"); b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); if (isComplexType(dtype)) { genComplexMathOperators(ctx, dtype); } if(!isInline) { genTileInverting(ctx, &gset, &tileSet); } if ( extraParams->ldsUse != LDS_NO_USE ) { SubproblemDim sdims; DBlockCopyFlags flags; unsigned int vecLen; if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) { sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor; sdims.y = gset.subdims[0].x; } else { sdims.x = gset.subdims[0].x; sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor; } vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B); flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0; copyDataBlockGen(ctx, &sdims, gset.pgran, dtype, DBLOCK_GLOBAL_TO_LOCAL, flags); kgenAddBlankLine(ctx); kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx); } declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false, true); kgenBeginFuncBody(ctx); declareLocalVariables(ctx, &gset, &parTile, extraParams); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, false); ptrName = dtypeUPtrField(dtype); sprintf(tmp, "uB.%s = B;\n\n", ptrName); kgenAddStmt(ctx, tmp); // external loop sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y); kgenBeginBranch(ctx, tmp); genZeroTile(ctx, &gset.tileCY); genSetupCoords(ctx, &gset, BLOCK_UPDATE); kgenAddStmt(ctx, "// Stage 1. Multiply and update with large blocks\n"); gset.tileA = tileSet.rectA; gset.tileBX = tileSet.origB; if (!isMatrixUpper(kflags) && tailM) { addrMode |= FETCH_ADDR_A_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); } ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0); alignK = subdims[1].bwidth; if (ldsLarge) { alignK *= extraParams->unrollingFactor; } if (ldsLarge) { const char *oldCoordB; FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE; bool withSkew; withSkew = useSkewedFetchB(&gset); if (!withSkew) { bamode |= FETCH_ADDR_B_RELATIVE; } else { bamode |= FETCH_ADDR_B_CYCLICAL; } setFetchAddrMode(mulOpts.fctx, bamode); if (tailN) { /* * Conditional branch for those items which hit into * matrix B with their matrix coordinates */ sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x); kgenBeginBranch(ctx, tmp); } if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) { kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName); } else { kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName); } if (withSkew) { unsigned int bwidthOld; oldCoordB = gset.varNames.coordB; gset.varNames.coordB = "skewX"; bwidthOld = gset.subdims[0].bwidth; gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows : parTile.nrCols; gset.subdims[0].bwidth = bwidthOld; } genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK); genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile, copy2LDSFuncName); genInternalLoopEnd(ctx); // loop over K if (withSkew) { gset.varNames.coordB = oldCoordB; setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL); // deliver from skew in the result before proceed to the next stage genTileCyclicalShift(ctx, &gset); } if (tailN) { kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); } setFetchAddrMode(mulOpts.fctx, addrMode); } if (!ldsLarge || tailN) { genCheckShiftTailB(ctx, &gset, 0, &tailStatus); if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) { addrMode |= FETCH_ADDR_B_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); } if (tailN) { sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")"); kgenBeginBranch(ctx, tmp); } genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK); tileMulGen(ctx, &gset, &mulOpts); genInternalLoopEnd(ctx); // loop over K if (tailN) { kgenEndBranch(ctx, NULL); } if (extraParams->ldsUse & LDS_USE_LARGE) { kgenEndBranch(ctx, NULL); } } sprintf(tmp, "uA.%s = A;\n\n", ptrName); kgenAddStmt(ctx, tmp); // processing tails along update dimension if (isMatrixUpper(kflags) && ((kflags & KEXTRA_TAILS_K_LOWER) || (ldsLarge && extraParams->unrolledTail))) { unsigned int tailChunks; tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ? extraParams->unrolledTail : 1; if (tailN) { char hitCond[1024]; sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")"); sprintf(tmp, "if ((currM + %lu < M) && %s)", subdims[0].y, hitCond); } else { sprintf(tmp, "if (currM + %lu < M)", subdims[0].y); } kgenBeginBranch(ctx, tmp); if (kflags & KEXTRA_TAILS_K_LOWER) { setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL); setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv); } if (tailChunks > 1) { mulOpts.flags &= ~TILEMUL_NOT_INC_K; sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks); kgenBeginBranch(ctx, tmp); } addrMode |= FETCH_ADDR_B_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); tileMulGen(ctx, &gset, &mulOpts); if (tailChunks > 1) { kgenEndBranch(ctx, NULL); mulOpts.flags |= TILEMUL_NOT_INC_K; } kgenEndBranch(ctx, NULL); } gset.tileA = tileSet.squareA; kgenAddStmt(ctx, "\n/*\n" " * Stage 2. A part of work items multiply got result on " "a respective\n" " * inverted diagonal block, and the remaining ones wait. " "Then they perform\n" " * one step of further intermediate result evaluation as " "multiplying tile by tile.\n" " * It continues until the whole panel of the " "matrix A is processed\n" " */\n"); // one must deal further with square blocks strictly gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y; sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)", subdims[0].y / subdims[1].y); kgenBeginBranch(ctx, tmp); if (extraParams->ldsUse & LDS_USE_DIAGONAL) { sprintf(tmp, "const int bid = lid %% %u;\n\n", l1Pans); kgenAddStmt(ctx, tmp); } /* * Update the intermediate result multiply on the inverted diagonal tile, * and write back */ genSetupCoords(ctx, &gset, TILE_UPDATE); sprintfStage2Condition(tmp, &gset, 0); ret = kgenBeginBranch(ctx, tmp); upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= tailStatusToUpresFlags(tailStatus); upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA; genUpdateIntermResult(ctx, &gset, false, upFlags); kgenAddBlankLine(ctx); lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) && (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 && !(tailM || tailN) && !(upFlags & UPRES_NO_VECTORIZATION) && !isComplexType(kextra->dtype)); /* * it's needed now to adjust addressing mode of A so as to don't * exceed the bound of A */ if (tailM) { setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_A_CYCLICAL | FETCH_ADDR_K_CYCLICAL); extraNew.flags |= KEXTRA_TAILS_K_LOWER; } genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts); gset.tileBX = tileSet.bStage2; if (tailM) { setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv); } kgenAddStmt(ctx, "// Write back the given result\n"); upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= tailStatusToUpresFlags(tailStatus); if (lds_diagonal) { sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans); } genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags, NULL, NULL, lds_diagonal ? tmp : NULL); kgenEndBranch(ctx, NULL); // multiply on the inverted tile path kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); // continue the tile update kgenAddBlankLine(ctx); sprintfStage2Condition(tmp, &gset, 1); kgenBeginBranch(ctx, tmp); genCheckShiftTailB(ctx, &gset, 0, &tailStatus); if (lds_diagonal) { // TODO: add here storing to LDS as well } else { addrMode |= FETCH_ADDR_B_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); tileMulGen(ctx, &gset, &mulOpts); } kgenEndBranch(ctx, NULL); // tile update path kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); kgenEndBranch(ctx, NULL); // second stage loop if (isMatrixUpper(kflags)) { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); kgenAddStmt(ctx, tmp); } kgenEndBranch(ctx, NULL); // loop over M ret = kgenEndFuncBody(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyFetchContext(mulOpts.fctx); destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }