// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ ((kflags & KEXTRA_COLUMN_MAJOR) != 0); char tmp[2048]; const char *typeName; DataType dtype = kextra->dtype; BlasGenSettings gset, tgset, lset, gset1; CLBLASKernExtra kextraTmp; TileMulOpts mulOpts, tmulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; size_t vecLen = kextra->vecLen; const char *outTypeName; bool b; TilePostFetchPrivate pfPriv; struct symvPrivate priv; size_t wgSize; bool tailM = (kflags & KEXTRA_TAILS_M) != 0; bool tailK = (kflags & KEXTRA_TAILS_K) != 0; bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0; bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A); bool isComplex = isComplexType(dtype); Tile tileb; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; bool isHoriz = subdims[1].bwidth >= subdims[1].y; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; unsigned int cLocal; unsigned int nPlans; wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&pfPriv, 0, sizeof(pfPriv)); memset(&priv, 0, sizeof(priv)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); declareSymvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); /* 1D work space. Matrix is divided among wi, each calculates it's own * part of vector y */ kgenAddStmt(ctx, "#define M actualN\n"); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.flags |= BGF_WHOLE_A | BGF_UPTRS; gset.kextra = kextra; gset.pgran = pgran; initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET); gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); cLocal = wgSize / bStep; nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n", gid, cLocal, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint n = coordA;\n"); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "actualN += startN;\n"); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "uint k = k0;\n"); if (tailK) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, "if (n + Ktail < N)"); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { vnames->sizeM = "N"; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "N"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (rowMaj) { mulOpts.flags |= TILEMUL_BW_STRIDE; } if (upper) { kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n"); } else { kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n"); } sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)", subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); upper ^= rowMaj; tra ^= rowMaj; if (upper ^ rowMaj && tra) { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= !upper; tgset = gset; tmulOpts = mulOpts; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kextraTmp = *kextra; gset1 = gset; kextraTmp.vecLen = 1; gset1.kextra = &kextraTmp; gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1; gset1.tileBX.nrRows = 1; gset1.tileA.nrCols = 1; kextraTmp.vecLenA = 1; } if (isHoriz) { lset = gset; lset.subdims[0].bwidth = lset.subdims[1].bwidth = lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y); lset.tileA.nrCols = lset.tileA.nrRows = lset.tileBX.nrRows = lset.subdims[1].y; kgenAddStmt(ctx, "// the diagonal\n"); kgenBeginBranch(ctx, "if (k <= n)"); kgenAddStmt(ctx, "uint k1 = k;\n"); if (subdims[1].bwidth != subdims[1].y) { kgenAddStmt(ctx, "// the pred diagonal\n"); sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ } initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET, lset.tileA.trans, lset.tileA.packed); declareOneTileStorage(ctx, &tileb); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.tilea = lset.tileA; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; lset.tileA = tileb; mulOpts.postFetch = genPostFetchMirror; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans = lset.tileA.trans ^= true; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; if (subdims[1].bwidth != subdims[1].y) { size_t width = umax(subdims[1].bwidth, subdims[1].y); kgenAddStmt(ctx, "// the post diagonal\n"); if (tailK) { kgenBeginBranch(ctx, "if(k < N)"); } sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)", lset.subdims[1].bwidth, width, width, width, lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (k += %lu; k < actualN; k++)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A; kgenEndBranch(ctx, NULL); } } sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } else { kgenAddStmt(ctx, "// the diagonal\n"); sprintf(tmp, "if (k <= (n + (get_local_id(0)%%%lu)*%lu))", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); kgenBeginBranch(ctx, NULL); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &gset; mulOpts.postFetch = genPostFetchVertDiag; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= true; lset = gset; sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, NULL); priv.diag = false; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* if */ } if (upper) { kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n"); } else { kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n"); } sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (; k < N; k++)"); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A; genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; pfPriv.gset = &tgset; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &tgset, &tmulOpts); if (ret != 0) { return ret; } if (isHoriz) { sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth); } else { sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n", subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* k loop */ kgenBeginBranch(ctx, "if (k < N)"); if (isHoriz) { kgenAddStmt(ctx, "k = n;\n"); } else { sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; if (!isHoriz) { if (upper ^ rowMaj && tra) { tmulOpts.flags &= ~TILEMUL_TRA; } else { tmulOpts.flags |= TILEMUL_TRA; } kgenAddStmt(ctx, "Ktail = N - n;\n"); priv.coord = true; } else { priv.coord = false; } tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K; ret = tileMulGen(ctx, &lset, &tmulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1); /* write back the results */ /* y := alpha*A*x + beta*y */ sprintf(tmp,"(%s - startN)", vnames->coordA); setResultPos(ctx, kflags, tmp); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
void genTest( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, bool separateFetch) { char s[1024]; Kstring kstr; char *tName, tVect[64], *ptrName; KernelVarNames *vnames = &gset->varNames; DataType dtype = gset->kextra->dtype; const SubproblemDim *subdims = gset->subdims; unsigned int vecLen = gset->kextra->vecLen; size_t m, n, k; unsigned int i, j; bool tra, trb, localA, localB, vecCoords; int ret; TileMulFlags flags = mulOpts->flags; FetchOpts fetchOpts; m = gset->subdims[1].y; n = gset->subdims[1].x; k = gset->subdims[1].bwidth; tra = ((flags & TILEMUL_TRA) != 0); trb = ((flags & TILEMUL_TRB) != 0); localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY); localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY); vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0); tVect[0] = '\0'; if (vecCoords && vecLen != 1) { sprintf(tVect, "%u", vecLen); } switch (dtype) { case TYPE_FLOAT: tName = "float"; ptrName = "f"; break; case TYPE_DOUBLE: tName = "double"; ptrName = "d"; break; case TYPE_COMPLEX_FLOAT: tName = "float2"; ptrName = "f2v"; break; case TYPE_COMPLEX_DOUBLE: tName = "double2"; ptrName = "d2v"; break; default: return; } if (vecCoords) { //Do not use GPtrs in fetching vnames->A = "A"; vnames->B = "B"; } else { vnames->A = localA ? "LAptr" : "((GPtr)A)"; vnames->B = localB ? "LBptr" : "((GPtr)B)"; } if (!localA) { vnames->lda = "lda"; } if (!localB) { vnames->ldb = "ldb"; } vnames->sizeM = "M"; vnames->sizeN = "N"; vnames->sizeK = "K"; vnames->skewA = "skewA"; vnames->skewB = "skewB"; vnames->skewK = "skewK"; vnames->coordA = "workItemM"; vnames->coordB = "workItemN"; vnames->k = "k"; kgenAddBlankLine(ctx); sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n", ITEM_WORK_M, ITEM_WORK_N); kgenAddStmt(ctx, s); kgenAddStmt(ctx, "__kernel void\n"); sprintf(s, "%s(\n", kernelName); kgenAddStmt(ctx, s); sprintf(s," %s alpha,\n", tName); kgenAddStmt(ctx, s); sprintf(s," __global %s%s *A,\n", tName, tVect); kgenAddStmt(ctx, s); sprintf(s," __global %s%s *B,\n", tName, tVect); kgenAddStmt(ctx, s); kgenAddStmt(ctx, " uint M,\n" " uint N,\n" " uint K,\n"); sprintf(s, " __global %s *C,\n" " const uint iter)\n", tName); kgenAddStmt(ctx, s); kgenBeginFuncBody(ctx); sprintf(s, "uint workItemM = %lu * get_global_id(0);\n" "uint workItemN = %lu * get_global_id(1);\n", m, n); kgenAddStmt(ctx, s); if ((flags & TILEMUL_SKEW_A) != 0) { kgenAddStmt(ctx, "uint skewA = 0u;\n"); } if ((flags & TILEMUL_SKEW_B) != 0) { kgenAddStmt(ctx, "uint skewB = 0u;\n"); } if ((flags & TILEMUL_SKEW_K) != 0) { kgenAddStmt(ctx, "uint skewK = 0u;\n"); } if (localA) { sprintf(s, "__local %s LA[%lu];\n", tName, subdims[0].bwidth * subdims[0].y); kgenAddStmt(ctx, s); } else { //global A sprintf(s, "uint lda = %s;\n", tra ? "M" : "K"); kgenAddStmt(ctx, s); } if (localB) { sprintf(s, "__local %s LB[%lu];\n", tName, subdims[0].bwidth * subdims[0].x); kgenAddStmt(ctx, s); } else { //global B sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N"); kgenAddStmt(ctx, s); } initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY); declareTileStorages(ctx, gset); if (vecCoords) { size_t ha, hb; char *str; ha = tra ? k : m; hb = trb ? n : k; if (ha > 1) { str = s; str += sprintf(str, "uint%lu ca = {0", ha); for (i = 1; i < ha; i++) { str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen); } str += sprintf(str, "};\n"); kgenAddStmt(ctx, s); } else { kgenAddStmt(ctx, "uint ca = 0;\n"); } vnames->vectCoordA = "ca"; if (hb > 1) { str = s; str += sprintf(str, "uint%lu cb = {0", hb); for (i = 1; i < hb; i++) { str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen); } str += sprintf(str, "};\n"); kgenAddStmt(ctx, s); } else { kgenAddStmt(ctx, "uint cb = 0;\n"); } vnames->vectCoordB = "cb"; // uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3}; // uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3}; } kgenAddBlankLine(ctx); sprintf(s, "for (int it = 0; it < iter; it++)"); kgenBeginBranch(ctx, s); if (!(localA && localB)) { kgenAddStmt(ctx, "uint k = 0;\n"); } genZeroTile(ctx, &gset->tileCY); if (vecCoords) { char *coordsA[2] = {"workItemM", "k"}; char *coordsB[2] = {"k", "workItemN"}; sprintf(s, "A += %s * (lda / %u) + %s / %u;\n", coordsA[tra], vecLen, coordsA[1 - tra], vecLen); kgenAddStmt(ctx, s); sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n", coordsB[trb], vecLen, coordsB[1 - trb], vecLen); kgenAddStmt(ctx, s); } sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth); kgenBeginBranch(ctx, s); /* Copy data to local memory. We know that the size of matrix is the same * that the size of one block and use that. */ if (localA) { sprintf(s, "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n" "wait_group_events(1, &evA);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n", subdims[0].y * subdims[0].bwidth); kgenAddStmt(ctx, s); kgenAddStmt(ctx, "LPtr LAptr;\n"); if (tra) { sprintf(s, "LAptr.%s = LA + workItemM;\n", ptrName); } else { sprintf(s, "LAptr.%s = LA + workItemM * %lu;\n", ptrName, subdims[0].bwidth); } kgenAddStmt(ctx, s); } if (localB) { sprintf(s, "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n" "wait_group_events(1, &evB);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n", subdims[0].x * subdims[0].bwidth); kgenAddStmt(ctx, s); kgenAddStmt(ctx, "LPtr LBptr;\n"); if (trb) { sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n", ptrName, subdims[0].bwidth); } else { sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName); } kgenAddStmt(ctx, s); } if (!separateFetch) { ret = tileMulGen(ctx, gset, mulOpts); checkRet(ret, "Multiplier"); } else { Tile *tileA = &gset->tileA; Tile *tileB = &gset->tileBX; memset(&fetchOpts, 0, sizeof(fetchOpts)); if (localA) { fetchOpts.memA = CLMEM_LOCAL_MEMORY; } if (localB) { fetchOpts.memB = CLMEM_LOCAL_MEMORY; } genFillTileWithNAN(ctx, tileA); genFillTileWithNAN(ctx, tileB); if (subdims[0].bwidth != subdims[1].bwidth) { sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)", subdims[0].bwidth, k); kgenBeginBranch(ctx, s); } #if JUST_MULTIPLICATION for (i = 0; i < tileA->nrRows; i++) { for(j = 0; j < tileA->nrCols; j++) { sprintfTileElement(&kstr, tileA, i, j, 1); sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j); kgenAddStmt(ctx, s); } } for (i = 0; i < tileB->nrRows; i++) { for(j = 0; j < tileB->nrCols; j++) { sprintfTileElement(&kstr, tileB, i, j, 1); sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j); kgenAddStmt(ctx, s); } } #else fetchOpts.mrole = MATRIX_B; fetchOpts.lineOffset = 0; fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows; ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts); checkRet(ret, "Fetching tile b"); fetchOpts.mrole = MATRIX_A; fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows; kgenAddBlankLine(ctx); fetchOpts.lineOffset = 0; ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts); checkRet(ret, "Fetching tile a"); #endif ret = genMulTiles(ctx, gset, mulOpts); checkRet(ret, "Multiplier"); #if ! JUST_MULTIPLICATION sprintf(s, "k += %lu;\n", k); kgenAddStmt(ctx, s); #endif if (subdims[0].bwidth != subdims[1].bwidth) { kgenEndBranch(ctx, NULL); } } kgenEndBranch(ctx, NULL); // K loop kgenEndBranch(ctx, NULL); // iterations loop kgenAddBlankLine(ctx); for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { sprintfTileElement(&kstr, &gset->tileCY, i, j, 1); sprintf(s, "((GPtr)C).%s" "[(%d + workItemM) * N + %d + workItemN] = %s;\n", ptrName, i, j, kstr.buf); kgenAddStmt(ctx, s); } } kgenEndFuncBody(ctx); }
// global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered; //yes, KEXTRA_TAILS_K because it is set if N % bw != 0 bool tailN = ((kflags & KEXTRA_TAILS_K) != 0); bool tailM = ((kflags & KEXTRA_TAILS_M) != 0); char tmp[4096]; DataType dtype = kextra->dtype; bool doubleBased = isDoubleBasedType(dtype); BlasGenSettings gset; TileMulOpts mulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; TilePostFetchPrivate pfPriv; unsigned int vecLen = kextra->vecLen; const char *outTypeName; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; const char *typeName; size_t wgSize; //unsigned int nStep = 32; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8; unsigned int cLocal; bool isComplex = isComplexType(dtype); unsigned int nPlans; typeName = dtypeBuiltinType(dtype); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations kgenDeclareUptrs(ctx, doubleBased); // now, generate the kernel declareGemvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); kgenAddStmt(ctx, "// M always denotes length of Y " "and N denotes length of X in the kernel\n"); /* 1D work space. Matrix is divided among wi, each calculates it's own * part of vector y */ wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); cLocal = wgSize/bStep; memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.pgran = pgran; gset.kextra = kextra; gset.flags = BGF_UPTRS; initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n", gid, bStep, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < M && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); if (tailN) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "M"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags |= TILEMUL_TRA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth; mulOpts.flags |= TILEMUL_BW_STRIDE; } sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)", cLocal*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); if (staggered) { vnames->k = "k1"; sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",staggered); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } vnames->k = "k"; kgenEndBranch(ctx, NULL); /* k loop */ if (tailN) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); kgenBeginBranch(ctx, "if (k < N)"); mulOpts.flags |= TILEMUL_SKEW_B; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K|TILEMUL_WRAP_AROUND_TAIL; setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { gset.subdims[0].bwidth = subdims[1].bwidth; mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep); /* write back the results */ /* y := alpha*A*x + beta*y */ setResultPos(ctx, kflags, vnames->coordA); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }
static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tmp[4096]; struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; bool doubleBased = isDoubleBasedType(dtype); size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered; int ret; BlasGenSettings gset; TileMulOpts mulOpts; int tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A); int trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B); unsigned int l1Pans; TilePostFetchPrivate pfPriv[2]; UpdateResultFlags upResFlags; TailStatus tailStatus; bool subgMode = false; SubgVarNames subgVNames; ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // mismatching subdims define case with subgroup decomposition subgMode = ( subdims[0].bwidth != subdims[1].bwidth ); memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.flags = BGF_DISTINCT_VECLEN; gset.flags |= BGF_WHOLE_A; /*FIXME: This used to be a workaround for compilation issues with dtrmm on * cpu. Normally BGF_WHOLE_A should be enabled always. But for now, * there are wrong results for non-aligned cases on CPU and there is * no workaround yet. if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K)) { gset.flags &= ~BGF_WHOLE_A; }*/ gset.kextra = kextra; gset.pgran = pgran; //avoid [0].bw loop //gset.subdims[0].bwidth = gset.subdims[1].bwidth; memset(pfPriv, 0, sizeof(pfPriv)); pfPriv[0].funcID = CLBLAS_TRMM; pfPriv[0].gset = &gset; if ((gset.flags & BGF_WHOLE_A) != 0) { pfPriv[0].wholeA = 1; } // at first, generate needed declarations kgenDeclareUptrs(ctx, doubleBased); // For inner callback, because both callbacks use own fetchNumA memcpy(&pfPriv[1], &pfPriv[0], sizeof(pfPriv[0])); // if both matrices are accessed row-major - using subgroup pattern if ( subgMode ) { declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Subgroup", true, true); gset.flags |= BGF_UPTRS; } else { declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Block", true, true); } kgenBeginFuncBody(ctx); initDefaultTiles(&gset, CLBLAS_TRMM, 0, PRIV_STORAGE_VARIABLE_SET); declareTileStorages(ctx, &gset); kgenAddStmt(ctx, "uint currM, currN;\n" "uint4 coord = 0; /* contains coordB, coordA, k */\n"); kgenDeclareLocalID(ctx, "lid", pgran); kgenDeclareGroupID(ctx, "gid", pgran); if ( subgMode ) { gset.varNames.LDS = "scratch"; // declaring variables used by subgroup mode subgVNames.itemId = "itemId"; subgVNames.subgCoord = "subgCoord"; kgenAddBlankLine( ctx ); kgenAddBlankLine(ctx); kgenPrintf(ctx, "int2 %s;\n", subgVNames.itemId ); kgenPrintf(ctx, "int2 %s;\n", subgVNames.subgCoord); // item ID kgenPrintf( ctx, "%s.x = get_local_id(0)%%%d;\n", subgVNames.itemId, subdims[0].bwidth/subdims[1].bwidth); // subgroup ID kgenPrintf( ctx, "%s.y = get_local_id(0)/%d;\n", subgVNames.itemId, subdims[0].bwidth/subdims[1].bwidth); // subgroup coordX kgenPrintf( ctx, "%s.x = %s.y/%d;\n", subgVNames.subgCoord, subgVNames.itemId, subdims[0].y/subdims[1].y ); // subgroup coordY kgenPrintf( ctx, "%s.y = %s.y%%%d;\n", subgVNames.subgCoord, subgVNames.itemId, subdims[0].y/subdims[1].y ); } kgenAddBlankLine(ctx); sprintf(tmp, "currN = gid * %lu;\n", subdims->x); kgenAddStmt(ctx, tmp); genInitCurrM(ctx, subdims, kflags); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, true); if ( subgMode ) { kgenAddStmt(ctx, "GPtr Ag = {A};\n" "GPtr Bg = {B};\n"); } l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; memset(&mulOpts, 0, sizeof(mulOpts)); mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? TILEMUL_MAD : TILEMUL_MULADD; mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = &pfPriv; mulOpts.flags = TILEMUL_NO_FLAGS; mulOpts.flags |= TILEMUL_EXTERN_RDECL; if ( subgMode ) { mulOpts.flags |= TILEMUL_NOT_INC_K; mulOpts.flags |= TILEMUL_BW_STRIDE; } if (kflags & KEXTRA_TAILS_M_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } if (kflags & KEXTRA_TAILS_N_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B; } if (kflags & KEXTRA_TAILS_K_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K; mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL; } if (tra) { mulOpts.flags |= TILEMUL_TRA; } if (!trb) { mulOpts.flags |= TILEMUL_TRB; } if (isMatrixConj(kflags, MATRIX_A)) { mulOpts.flags |= TILEMUL_CONJA; } if (isMatrixConj(kflags, MATRIX_B)) { mulOpts.flags |= TILEMUL_CONJB; } initKernelVarNames(&gset.varNames); if ( subgMode ) { kgenPrintf( ctx, "coord.x = currN + %s.x*%d;\n", subgVNames.subgCoord, subdims[1].x ); } else { sprintf(tmp, "coord.x = currN + lid %% %u * %lu;\n", l1Pans, subdims[1].x); kgenAddStmt(ctx, tmp); } // loop over M sprintf(tmp, "for (uint m0 = 0; m0 < M; m0 += %lu)", subdims[0].y); kgenBeginBranch(ctx, tmp); genStartPosK( ctx, subdims, kflags, subgMode ); sprintf(tmp, "coord.z = kBegin;\n"); kgenAddStmt(ctx, tmp); if ( subgMode ) { kgenPrintf(ctx, "coord.y = currM + %s.y*%d;\n", subgVNames.subgCoord, subdims[1].y); } else { sprintf( tmp, "coord.y = currM + lid / %u * %lu;\n", l1Pans, subdims[1].y ); kgenAddStmt(ctx, tmp); } genZeroTile(ctx, &gset.tileCY); checkGenBeginHitMatrixBlock(ctx, kflags); tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRMM, &gset, NULL); // loops along 'K' if ( subgMode ) { ret = genSubgLoopsK( ctx, &gset, &mulOpts, &subgVNames, staggered); } else { ret = genLoopsK( ctx, &gset, &mulOpts, tmp ); } if (ret != 0) { printf("%s", buf); return ret; } checkGenEndHitMatrixBlock(ctx, kflags); kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); // store results // for result update - x coordinate is in elements, not in vectors checkGenRestoreTailCoords(ctx, &gset, tailStatus); upResFlags = kextraToUpresFlags(CLBLAS_TRMM, kflags); upResFlags |= tailStatusToUpresFlags(tailStatus); upResFlags |= UPRES_INDEXING_WITH_CONSTANTS; upResFlags |= UPRES_TRIANG_WRITE_C; upResFlags |= UPRES_EXCEED_PROBLEM_CONDITION; if ( subgMode ) { mergeUpdateResult( ctx, CLBLAS_TRMM, &gset, &subgVNames, upResFlags, genResultUpdateWithFlags ); } else { //checkGenBeginHitMatrixBlock(ctx, kflags); genResultUpdateWithFlags( ctx, CLBLAS_TRMM, &gset, upResFlags, NULL, NULL, NULL ); //checkGenEndHitMatrixBlock(ctx, kflags); } if (isMatrixUpper(kflags)) { sprintf(tmp, "currM += %lu;\n", subdims[0].y); } else { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; }