/*
 * isFitToLDS()
 *
 * 1. We will assume "dim[0].y" to be the TRIANGLE_HEIGHT, i.e. the number of
 *    variables solved by the corresponding TRTRI kernel.
 *
 * NOTE:
 * 1. It is possible that this function causes "dim[0].y" to change from what
 *    was used in the "trtri" counterpart. In such a case, we detect this in
 *    "xtrsv.c" and abort the TRSV call.
 * 2. We may need to scale down the inflated estimates we are returning here.
 */
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    CLBlasKargs *blasArgs = (CLBlasKargs *)kernelArgs;
    size_t MAXBLOCKSIZE = 256;
    cl_ulong maxSize;

    if ( ((blasArgs->transA == clblasNoTrans) && (blasArgs->order == clblasColumnMajor)) ||
         ((blasArgs->transA != clblasNoTrans) && (blasArgs->order == clblasRowMajor)) )
    {
        //
        // Estimate worst-case local memory needed - vector width of 4
        // irrespective of data type
        //
        cl_ulong tw;

        tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4);
        if (tw == 0) {
            do {
                MAXBLOCKSIZE /= 2;
                tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4);
            } while ((MAXBLOCKSIZE > 1) && (tw == 0));
        }
        #ifdef DEBUG_TRSV_GEMV
        printf("TRSV GEMV: isFitLDS() tw = %lu\n", tw);
        #endif

        maxSize = (1 + 4 + tw) * dtypeSize(dtype) +
                  MAXBLOCKSIZE * dtypeSize(dtype) * 4;
        #ifdef DEBUG_TRSV_GEMV
        printf("TRSV GEMV: isFitLDS() maxSize = %lu, ldsSize = %lu, Y = %lu\n",
               maxSize, ldsSize, dim[0].y);
        #endif
        return (maxSize < ldsSize);
    }

    //
    // The remaining kernels use "TriangleWidth" amount of local memory for
    // storing the RHS. We will assume "dim[0].y" to be the "TriangleWidth".
    //
    MAXBLOCKSIZE = (dim[0].y) * (dim[0].y) > 256 ? 256 : dim[0].y * dim[0].y;
    maxSize = (dim[0].y + MAXBLOCKSIZE) * dtypeSize(dtype);
    return (maxSize < ldsSize);
}
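/*
 * Worked example (illustration only, not part of clBLAS): the first branch
 * above estimates LDS use as (1 + 4 + tw) elements plus MAXBLOCKSIZE elements
 * at a vector width of 4. Assuming a target width tw = 64 returned by
 * getTargetWidth(), double elements (8 bytes) and MAXBLOCKSIZE = 256, the
 * estimate works out as below. A minimal standalone program:
 */
#include <stdio.h>

int main(void)
{
    unsigned long tw = 64;            /* assumed getTargetWidth() result */
    unsigned long tsize = 8;          /* sizeof(double) */
    unsigned long maxBlockSize = 256;

    /* (1 + 4 + 64) * 8 + 256 * 8 * 4 = 552 + 8192 = 8744 bytes */
    unsigned long maxSize = (1 + 4 + tw) * tsize + maxBlockSize * tsize * 4;

    printf("maxSize = %lu bytes\n", maxSize);
    return 0;
}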
clblasStatus VISIBILITY_HIDDEN
checkVectorSizes(
    DataType dtype,
    size_t N,
    cl_mem x,
    size_t offx,
    int incx,
    ErrorCodeSet err)
{
    size_t memSize, sizev;
    size_t tsize;

    if (N == 0) {
        return clblasInvalidDim;
    }

    if (incx == 0) {
        switch (err) {
        case X_VEC_ERRSET:
            return clblasInvalidIncX;
        case Y_VEC_ERRSET:
            return clblasInvalidIncY;
        default:
            return clblasNotImplemented;
        }
    }

    if (clGetMemObjectInfo(x, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
            CL_SUCCESS) {
        switch (err) {
        case X_VEC_ERRSET:
            return clblasInvalidVecX;
        case Y_VEC_ERRSET:
            return clblasInvalidVecY;
        default:
            return clblasNotImplemented;
        }
    }

    tsize = dtypeSize(dtype);
    sizev = ((N - 1) * abs(incx) + 1) * tsize;
    offx *= tsize;

    /* reject plain overruns as well as size_t overflow in offx + sizev */
    if ((offx + sizev > memSize) || (offx + sizev < offx)) {
        switch (err) {
        case X_VEC_ERRSET:
            return clblasInsufficientMemVecX;
        case Y_VEC_ERRSET:
            return clblasInsufficientMemVecY;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}
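/*
 * The heart of the check above is the tight size bound for a strided vector:
 * the last element accessed lies (N - 1) * |incx| elements past the first,
 * so ((N - 1) * |incx| + 1) elements must fit in the buffer. A minimal
 * standalone sketch of that arithmetic (requiredVectorBytes is a
 * hypothetical helper, not a clBLAS function):
 */
#include <stdio.h>
#include <stdlib.h>

static size_t requiredVectorBytes(size_t N, int incx, size_t tsize)
{
    /* bytes spanned by N elements with stride incx, tsize bytes each */
    return ((N - 1) * (size_t)abs(incx) + 1) * tsize;
}

int main(void)
{
    /* 10 doubles with stride 3 touch elements 0, 3, ..., 27:
     * 28 elements, i.e. 224 bytes */
    printf("%zu\n", requiredVectorBytes(10, 3, sizeof(double)));
    return 0;
}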
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    cl_ulong size;
    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;

    size = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side);

    return (size * dtypeSize(dtype) <= ldsSize);
}
int
copyDataBlockGen(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    const PGranularity *pgran,
    DataType dtype,
    DBlockCopyDirection dir,
    DBlockCopyFlags flags)
{
    int r;
    GenPriv gpriv;
    unsigned int tsize;

    tsize = dtypeSize(dtype);
    if (dir == DBLOCK_LOCAL_TO_IMAGE || dir == DBLOCK_GLOBAL_TO_IMAGE) {
        size_t rowSize;

        if (dim != NULL) {
            rowSize = tsize * dim->x;
            if (rowSize % sizeof(cl_float4) != 0) {
                // only float4-aligned rows are supported
                return -EINVAL;
            }
        }
        if (flags & DBLOCK_COPY_TRANSPOSE) {
            return -EINVAL;
        }
    }

    memset(&gpriv, 0, sizeof(gpriv));
    gpriv.transp = (flags & DBLOCK_COPY_TRANSPOSE);
    gpriv.packed = (flags & DBLOCK_COPY_PACKED_IMAGE);
    if (dtype != TYPE_COMPLEX_DOUBLE) {
        gpriv.notVectorize = (flags & DBLOCK_COPY_NOT_VECTORIZE);
    }
    if ((flags & DBLOCK_COPY_CONJUGATE) && isComplexType(dtype)) {
        gpriv.conjugate = true;
    }

    initGenPriv(&gpriv, dtype, tsize, dim, dir, NULL, pgran);

    if (dim) {
        r = copyDBlockOptimGen(ctx, dim, pgran, &gpriv);
    }
    else {
        r = copyDBlockGenericGen(ctx, pgran, &gpriv);
    }

    return r;
}
int
generateZeroingFuncs(
    ZeroFuncs *funcNames,
    struct KgenContext *ctx,
    const SubproblemDim *blasDim,
    const PGranularity *pgran,
    DataType dtype,
    ZeroGenHelperFlags flags)
{
    int ret = 0;
    SubproblemDim dim[MATRIX_ROLES_NUMBER];
    size_t tsize, nvecs;
    unsigned int i, j;

    tsize = dtypeSize(dtype);

    nvecs = fl4RowWidth(blasDim->bwidth, tsize);
    checkInitSubdim(&dim[MATRIX_A], flags, ZF_MATRIX_A, nvecs * blasDim->y, 1);
    checkInitSubdim(&dim[MATRIX_B], flags, ZF_MATRIX_B, nvecs * blasDim->x, 1);

    nvecs = fl4RowWidth(blasDim->x, tsize);
    checkInitSubdim(&dim[MATRIX_C], flags, ZF_MATRIX_C, nvecs * blasDim->y, 1);

    for (i = 0; (i < MATRIX_ROLES_NUMBER) && !ret; i++) {
        if (dim[i].x == SUBDIM_UNUSED) {
            continue;
        }

        // check whether the function is already generated
        j = lookupDim(dim, i);
        if (j != IDX_INVAL) {
            strcpy(funcNames->names[i], funcNames->names[j]);
        }
        else {
            ret = f4zeroBlockGen(ctx, &dim[i], pgran, "__local");
            if (!ret) {
                kgenGetLastFuncName(funcNames->names[i], FUNC_NAME_MAXLEN,
                                    ctx);
            }
            kgenAddBlankLine(ctx);
        }
    }

    return ret;
}
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    cl_ulong size;

    (void)kernelArgs;

    /*
     * One needs y1 * wgSize elements of local memory, but y1 is not
     * calculated yet. The expression below reliably produces a larger
     * value; it overestimates by a factor of dims[1].bwidth.
     */
    size = dim[0].y * dim[0].bwidth * dtypeSize(dtype);

    return (size <= ldsSize);
}
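/*
 * A quick numeric check of the overestimate above (illustration only): with
 * dim[0].y = 64, dim[0].bwidth = 8 and double elements, the bound is
 * 64 * 8 * 8 = 4096 bytes. The 32 KB LDS figure below is an assumption for
 * the example, not a queried device limit:
 */
#include <stdio.h>

int main(void)
{
    unsigned long y = 64, bwidth = 8, tsize = 8;  /* double elements */
    unsigned long ldsSize = 32768;                /* assumed 32 KB LDS */
    unsigned long size = y * bwidth * tsize;

    printf("%lu bytes, fits: %d\n", size, size <= ldsSize);
    return 0;
}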
clblasStatus VISIBILITY_HIDDEN
checkBandedMatrixSizes(
    DataType dtype,
    clblasOrder order,
    clblasTranspose transA,
    size_t M,
    size_t N,
    size_t KL,
    size_t KU,
    cl_mem A,
    size_t offA,
    size_t lda,
    ErrorCodeSet err)
{
    size_t memSize, matrSize, tsize, K, memUsed;
    size_t unusedTail = 0;
    bool tra;

    if ((M == 0) || (N == 0)) {
        return clblasInvalidDim;
    }

    tsize = dtypeSize(dtype);
    K = KL + KU + 1;
    tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
          (order == clblasColumnMajor && transA == clblasNoTrans);

    if (lda < K) {
        switch (err) {
        case A_MAT_ERRSET:
            return clblasInvalidLeadDimA;
        case B_MAT_ERRSET:
            return clblasInvalidLeadDimB;
        case C_MAT_ERRSET:
            return clblasInvalidLeadDimC;
        default:
            return clblasNotImplemented;
        }
    }

    if (tra) {
        matrSize = ((N - 1) * lda + K) * tsize;
        unusedTail = (lda - N) * tsize;
    }
    else {
        matrSize = ((M - 1) * lda + K) * tsize;
        unusedTail = (lda - M) * tsize;
    }

    offA *= tsize;

    if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
            CL_SUCCESS) {
        switch (err) {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        default:
            return clblasNotImplemented;
        }
    }

    // Calculate the memory required. Note that 'matrSize' already takes into
    // account the fact that there might be an unused tail, i.e. the elements
    // between lda and M in the last column for column-major order, or between
    // lda and N in the last row for row-major order.
    memUsed = offA + matrSize;
    if (memUsed > memSize) {
        switch (err) {
        case A_MAT_ERRSET:
            return clblasInsufficientMemMatA;
        case B_MAT_ERRSET:
            return clblasInsufficientMemMatB;
        case C_MAT_ERRSET:
            return clblasInsufficientMemMatC;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}
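/*
 * The band-storage bound used above, as a standalone sketch (illustration
 * only; bandedMatrixBytes is a hypothetical helper, not clBLAS API): each of
 * the `cols` stored columns holds K = KL + KU + 1 band elements, so the
 * block spans ((cols - 1) * lda + K) elements, where cols is M or N
 * depending on the effective transposition.
 */
#include <stdio.h>

static size_t bandedMatrixBytes(size_t cols, size_t KL, size_t KU,
                                size_t lda, size_t tsize)
{
    size_t K = KL + KU + 1;   /* height of the packed band */

    return ((cols - 1) * lda + K) * tsize;
}

int main(void)
{
    /* 100 columns, KL = 2, KU = 3, lda = 8, float elements:
     * (99 * 8 + 6) * 4 = 3192 bytes */
    printf("%zu\n", bandedMatrixBytes(100, 2, 3, 8, sizeof(float)));
    return 0;
}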
/*
 * Assign a scalar that a matrix is multiplied by as a kernel argument
 */
void VISIBILITY_HIDDEN
assignScalarKarg(KernelArg *arg, const void *value, DataType dtype)
{
    arg->typeSize = dtypeSize(dtype);
    memcpy(arg->arg.data, value, arg->typeSize);
}
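/*
 * What assignScalarKarg() amounts to, as a self-contained sketch: the raw
 * bytes of the scalar are copied into the argument's byte buffer, sized by
 * the element type. KernelArgLike is a hypothetical stand-in for the real
 * KernelArg structure (illustration only):
 */
#include <stdio.h>
#include <string.h>

typedef struct {
    size_t typeSize;
    unsigned char data[16];   /* enough for a double-complex scalar */
} KernelArgLike;

int main(void)
{
    KernelArgLike arg;
    float alpha = 2.5f;

    arg.typeSize = sizeof(alpha);
    memcpy(arg.data, &alpha, arg.typeSize);
    printf("packed %zu bytes\n", arg.typeSize);
    return 0;
}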
static void
declareLocalVariables(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    Tile *parTile,
    TrsmExtraParams *extraParams)
{
    char tmp[1024];
    const SubproblemDim *dims = gset->subdims;
    const char *parTileTypeName = NULL;
    bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags,
                                    MATRIX_B);
    unsigned int locWidth;
    unsigned int tsize;
    unsigned int parTileSize;
    unsigned int l1Pans;
    unsigned int step;

    kgenAddStmt(ctx,
                "const int lid = get_local_id(0);\n"
                "const int gid = get_group_id(0);\n"
                "GPtr uA, uB;\n"
                "uint coordA, coordB;\n"
                "uint m0 = 0, k0, m1;\n");

    if (isMatrixUpper(gset->kextra->flags)) {
        sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n",
                dims[0].y, dims[0].y);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Declare private blocks.
     * The region 'b' stores, at different times, tiles of both
     * the input matrices and the result.
     */
    declareTileStorages(ctx, gset);

    *parTile = gset->tileBX;

    if (extraParams->ldsUse) {
        tsize = dtypeSize(gset->kextra->dtype);
        l1Pans = (unsigned int)(dims[0].x / dims[1].x);

        parTile->vecLen = (trb) ? (unsigned int)dims[1].x
                                : (unsigned int)dims[1].bwidth;
        parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize);
        parTile->trans = trb;

        /*
         * Allocate enough space in the local area to fit several tiles
         * at stage 1 (according to the unrolling factor) and one tile
         * at stage 2.
         */
        locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor;
        if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
            locWidth = umax(locWidth, (unsigned int)dims[1].y);
        }
        if (trb) {
            parTile->nrRows = locWidth;
            parTile->nrCols = (unsigned int)dims[0].x;
            step = (unsigned int)dims[1].x / parTile->vecLen;
        }
        else {
            parTile->nrRows = (unsigned int)dims[0].x;
            parTile->nrCols = locWidth;
            step = (unsigned int)dims[1].x * locWidth / parTile->vecLen;
        }
        parTileSize = tileVectorsNum(parTile);

        getVectorTypeName(gset->kextra->dtype, parTile->vecLen,
                          &parTileTypeName, NULL);
        sprintf(tmp, "__local %s tmpB[%i];\n"
                     "LPtr lB;\n"
                     "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n",
                parTileTypeName, parTileSize, l1Pans, step);
        kgenAddStmt(ctx, tmp);

        if (useSkewedFetchB(gset)) {
            kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n",
                       l1Pans, gset->subdims[1].x);
        }
    }

    kgenAddBlankLine(ctx);
}
static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra)
{
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CLBlasKargs *kargs = (CLBlasKargs*)args;
    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
    size_t loadBatch;
    unsigned int wgSize;
    unsigned int workRatio;
    unsigned int ldsUse = LDS_NO_USE;
    KernelExtraFlags kflags = kextra->flags;
    SubproblemDim globDim;
    bool isAmdGPU;

    /*
     * Calculate the size of the batch loaded from global to local memory
     * at each iteration of stage 1. Choose an unrolling factor that allows
     * each work item to load at least 16 bytes, which provides efficient
     * global memory access.
     */
    loadBatch = subdims[0].x * subdims[1].bwidth * dtypeSize(kargs->dtype);
    wgSize = (unsigned int)((subdims[0].x / subdims[1].itemX) *
                            (subdims[0].y / subdims[1].itemY));
    if (loadBatch < wgSize) {
        workRatio = 1;
    }
    else {
        workRatio = 16 / ((unsigned int)loadBatch / wgSize);
        if (!workRatio) {
            workRatio = 1;
        }
    }

#ifndef NDEBUG
    {
        const char *envImpl = getenv("AMD_CLBLAS_TRSM_LDSUSE");

        if (envImpl != NULL) {
            unsigned int w = atoi(envImpl);

            ldsUse = w % 10;
            w = w / 10;
            workRatio = w > 0 ? w : workRatio;
        }
    }
#endif

    /* note: this overrides any ldsUse value picked from the environment above */
    ldsUse = LDS_NO_USE;
    isAmdGPU = ((kflags & KEXTRA_VENDOR_AMD) != 0);
    if ((isAmdGPU &&
         !(kflags & (KEXTRA_TAILS_K_LOWER | KEXTRA_TAILS_M_LOWER))) ||
        (!isAmdGPU && !(kflags & KEXTRA_TAILS_M))) {
        ldsUse = LDS_USE_LARGE;
    }

    kargsToProbDims(&globDim, CLBLAS_TRSM, args, false);

    extraParams->ldsUse = ldsUse;
    extraParams->unrollingFactor = workRatio;
    extraParams->unrolledTail =
        (unsigned int)(((globDim.bwidth % (subdims[1].bwidth * workRatio)) +
                        subdims[1].bwidth - 1) / subdims[1].bwidth);

    fixupTrxmKargs(kargs);
}
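/*
 * The unrolling-factor choice above, extracted into a standalone sketch
 * (chooseWorkRatio is hypothetical, not clBLAS API): pick the factor so
 * that each work item loads at least 16 bytes per unrolled step.
 */
#include <stdio.h>

static unsigned int chooseWorkRatio(size_t loadBatch, unsigned int wgSize)
{
    unsigned int perItem;
    unsigned int ratio;

    if (loadBatch < wgSize) {
        return 1;
    }
    perItem = (unsigned int)loadBatch / wgSize;   /* bytes per work item */
    ratio = 16 / perItem;
    return ratio ? ratio : 1;
}

int main(void)
{
    /* e.g. subdims[0].x = 32, subdims[1].bwidth = 8, float elements:
     * 32 * 8 * 4 = 1024 bytes per iteration; a 256-item work group loads
     * 4 bytes per item, so 4 iterations are unrolled to reach 16 bytes */
    printf("workRatio = %u\n", chooseWorkRatio(32 * 8 * 4, 256));
    return 0;
}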
// global memory based kernel generator
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    char tmp[4096], tmp1[4096];
    char *p;                    // cursor into tmp1 used when unrolling the
                                // reorder loop over N at the top level
    const char *typeName;
    char fpref;
    DataType dtype = kextra->dtype;
    ssize_t ret;
    BlasGenSettings gset;
    BlkMulOpts mulOpts;
    unsigned int tsize;
    unsigned int vecLen, outVecLen;
    bool b;
    const char *outTypeName;
    unsigned int i;
    unsigned int nrRegs, regPitch;
    int tra, trb;
    char vect[2] = {'y', 'x'};

    const char *coordConstants =
        "const uint workItemM = get_global_id(0) * %lu;\n"
        "const uint workItemN = get_global_id(1) * %lu;\n"
        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
        "uint vectK = (K + %u) / %u;\n";

    /*
     * template for the image based gemm preparation part
     * for a two dimensional work space
     */
    const char *localVariables =
        "uint k0;\n"
        "int2 coordA = (int2)(0, workItemM);\n"
        "int2 coordB = (int2)(0, workItemN);\n"
        "%s c[%u];\n\n";

    tsize = dtypeSize(dtype);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    if (isComplexType(dtype)) {
        regPitch = (unsigned int)subdims[1].x;
    }
    else {
        regPitch = (unsigned int)fl4RowWidth(subdims[1].x, tsize) *
                   sizeof(cl_float4) / tsize;
    }

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;
    initKernelVarNames(&gset.varNames, kextra->flags);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate the needed declarations and auxiliary functions
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    // now, generate the kernel
    sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
            typeName, typeName, typeName);
    kgenDeclareFunction(ctx, tmp);
    ret = kgenBeginFuncBody(ctx);

    // constants
    sprintf(tmp, coordConstants,
            subdims[1].y, subdims[1].x, subdims[1].y, vecLen - 1, vecLen);
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then declare local
     * variables
     */
    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);

    sprintf(tmp, localVariables, outTypeName, nrRegs);
    kgenAddStmt(ctx, tmp);

    // check if the offset exceeds the matrix
    kgenAddStmt(ctx, "if ((workItemM >= M) ||"
                     "(workItemN >= N)) {\n"
                     "    return;\n"
                     "}\n");

    kgenAddStmt(ctx, "C += offsetC;\n");

    // zero C block
    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
                 "    c[k0] = 0;\n"
                 "}\n\n",
            nrRegs);
    kgenAddStmt(ctx, tmp);

    // block multiplication inlined function
    sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)",
            subdims[1].bwidth / vecLen);
    kgenBeginBranch(ctx, tmp);

    mulOpts.aMobj = CLMEM_IMAGE;
    mulOpts.bMobj = CLMEM_IMAGE;
    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE;
    if (isComplexType(dtype)) {
        mulOpts.core = BLKMUL_SEPARATE_MULADD;
    }
    else {
        mulOpts.core = BLKMUL_MAD;
    }
    mulOpts.argNames.coordA = "coordA";
    mulOpts.argNames.coordB = "coordB";
    mulOpts.argNames.skewCol = "skewCol";
    mulOpts.argNames.skewRow = "skewRow";
    mulOpts.argNames.k = "k0";
    mulOpts.argNames.vectBoundK = "vectK";
    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
    if (ret) {
        destroyKgenContext(ctx);
        return -EOVERFLOW;
    }

    // update image coordinates
    sprintf(tmp, "\ncoordA.x += %lu;\n"
                 "coordB.x += %lu;\n",
            subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
    kgenAddStmt(ctx, tmp);
    kgenEndBranch(ctx, NULL);

    // reorder the given solution
    outVecLen = isComplexType(dtype) ? 1 : vecLen;
    p = tmp1;
    for (i = 0; i < regPitch / outVecLen; i++) {
        unsigned int k = (unsigned int)(subdims[1].y - 1) *
                         regPitch / outVecLen + i;

        sprintf(p, "\n"
                   "    tmp = c[%u];\n"
                   "    for (j = %lu; j >= 0; j--) {\n"
                   "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                   "    }\n"
                   "    c[%u] = tmp;\n",
                k, subdims[1].y - 2,
                regPitch / outVecLen, i,
                regPitch / outVecLen, i, i);
        p += strlen(p);
    }
    sprintf(tmp, "\n"
                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
                 "    int j;\n"
                 "    %s tmp;\n"
                 "%s"
                 "}\n"
                 "\n",
            outTypeName, tmp1);
    kgenAddStmt(ctx, tmp);

    tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B);
    sprintf(tmp, "coordA.%c = workItemM;\n"
                 "coordB.%c = workItemN;\n\n",
            vect[tra], vect[trb]);
    kgenAddStmt(ctx, tmp);

    // write back the evaluated tile
    generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
// Preparation function for the image based kernel generator
static ssize_t
preparator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    char tmp[4096], conjStr[1024];
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CopyImgFuncs copyImgFuncs;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset;
    unsigned int vecLen;
    unsigned int tsize;
    const char *typeName;
    char fpref;
    bool b;
    size_t localBufSize;
    ssize_t ret;
    const char *conjCond;

    const char *functionHeadA =
        "int tra, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "uint m = (gid / bpr) * %lu;\n"
        "uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "A += offsetA;\n"
        "tra = (!transA && order == clblasColumnMajor) ||\n"
        "      (transA && order == clblasRowMajor);\n"
        "if (m >= M) {\n"
        "    return;\n"
        "}\n";

    const char *functionHeadB =
        "int trb, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "const uint n = (gid / bpr) * %lu;\n"
        "const uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "B += offsetB;\n"
        "trb = (!transB && order == clblasRowMajor) ||\n"
        "      (transB && order == clblasColumnMajor);\n"
        "if (n >= N) {\n"
        "    return;\n"
        "}\n";

    // Distribute blocks across compute units and copy matrix A to the image.
    // Transposition and filling with zeros in unaligned cases is done using
    // a buffer in local memory.
    const char *copyToImageA =
        "//copy matrix A block\n"
        "y = m + %u <= M ? %u : M - m;\n"
        "x = k + %u <= K ? %u : K - k;\n"
        "aligned = (x == %u) && (y == %u) && %d;\n"
        "int atcase = aligned * 10 + tra;\n"
        "%s" // conjugated check
        "if (atcase != 10) {\n"
        "    %s((__local float4*)temp);\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "}\n"
        "switch (atcase) {\n"
        "case 10: //aligned, not transposed\n"
        "    %s(imgA, k / %u, m, (GPtr)A, m, k, lda);\n"
        "    break;\n"
        "%s" // conjugated case
        "case 1: //not aligned, transposed\n"
        "    // generic transposed global to local\n"
        "    %s((LPtr)temp, (GPtr)A, k, m, x, y, %u, lda);\n"
        "    break;\n"
        "case 0: //not aligned, not transposed\n"
        "    // generic global to local\n"
        "    %s((LPtr)temp, (GPtr)A, m, k, y, x, %u, lda);\n"
        "    break;\n"
        "case 11: //aligned, transposed\n"
        "    // optimized transposed global to local\n"
        "    %s((LPtr)temp, (GPtr)A, k, m, lda);\n"
        "    break;\n"
        "}\n"
        "if (atcase != 10) {\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "    %s(imgA, k / %u, m, (LPtr)temp);\n"
        "}\n"
        "\n";

    const char *copyToImageB =
        "//copy matrix B block\n"
        "y = n + %u <= N ? %u : N - n;\n"
        "x = k + %u <= K ? %u : K - k;\n"
        "aligned = (x == %u) && (y == %u) && %d;\n"
        "int atcase = aligned * 10 + trb;\n"
        "%s" // conjugated check
        "if (atcase != 10) {\n"
        "    %s((__local float4*)temp);\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "}\n"
        "switch (atcase) {\n"
        "case 10: //aligned, not transposed\n"
        "    %s(imgB, k / %u, n, (GPtr)B, n, k, ldb);\n"
        "    break;\n"
        "%s" // conjugated case
        "case 1: //not aligned, transposed\n"
        "    // generic transposed global to local\n"
        "    %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
        "    break;\n"
        "case 0: //not aligned, not transposed\n"
        "    // generic global to local\n"
        "    %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
        "    break;\n"
        "case 11: //transposed, aligned\n"
        "    // optimized transposed global to local\n"
        "    %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
        "    break;\n"
        "}\n"
        "if (atcase != 10) {\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "    %s(imgB, k / %u, n, (LPtr)temp);\n"
        "}\n"
        "\n";

    memset(&copyImgFuncs, 0, sizeof(copyImgFuncs));
    memset(&gset, 0, sizeof(gset));

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    tsize = dtypeSize(dtype);
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    declareBlasEnums(ctx);

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;

    // generate the necessary memory to image copying functions
    generateImageCopyFuncs(&copyImgFuncs, ctx, CLBLAS_GEMM, &gset);
    kgenAddBlankLine(ctx);

    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    if (kextra->kernType == CLBLAS_PREP_A_KERNEL) {
        sprintf(tmp, prepareImagesGemmDeclA, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        ret = kgenBeginFuncBody(ctx);

        // the same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadA,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].y, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                       "(transA == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr,
                    "case 110: //conjugated, not transposed, aligned\n"
                    "    %s((LPtr)temp, (GPtr)A, m, k, lda);\n"
                    "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_A]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageA,
                subdims[1].y, subdims[1].y,            // y = m + dy <= M ?...
                subdims[1].bwidth, subdims[1].bwidth,  // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].y,       // aligned = (x==bw1)&&(y==dy1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_A) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_A],
                copyImgFuncs.globalToImage[MATRIX_A], vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_A],
                copyImgFuncs.localToImage[MATRIX_A], vecLen);
        kgenAddStmt(ctx, tmp);
    }
    else { // PREP_B
        sprintf(tmp, prepareImagesGemmDeclB, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        ret = kgenBeginFuncBody(ctx);

        // the same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadB,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].x, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                       "(transB == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr,
                    "case 110: //conjugated, not transposed, aligned\n"
                    "    %s((LPtr)temp, (GPtr)B, n, k, ldb);\n"
                    "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_B]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageB,
                subdims[1].x, subdims[1].x,            // y = n + dy <= N ?...
                subdims[1].bwidth, subdims[1].bwidth,  // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].x,       // aligned = (x==bw1)&&(y==dx1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_B) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_B],
                copyImgFuncs.globalToImage[MATRIX_B], vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_B],
                copyImgFuncs.localToImage[MATRIX_B], vecLen);
        kgenAddStmt(ctx, tmp);
    }

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    unsigned int divider1 = dtypeSize(dtype)/sizeof(cl_float);

    //EINVAL
    if( (subdimsNum < 2) ||
        (NULL == pgran) ||
        (NULL == subdims) ){
        return false;
    }

    if( 0 == subdims[0].x ||
        0 == subdims[0].y ||
        0 == subdims[0].bwidth ||
        0 == subdims[1].x ||
        0 == subdims[1].y ||
        0 == subdims[1].bwidth ){
        return false;
    }

    if( subdims[1].x != subdims[1].itemX ||
        subdims[1].y != subdims[1].itemY ){
        return false;
    }

    // the group block must consist of an integer number of subgroup blocks
    if( subdims[0].x % subdims[1].x ||
        subdims[0].y % subdims[1].y ||
        subdims[0].bwidth % subdims[1].bwidth ){
        return false;
    }

    // check fitting of bw to common vector sizes
    if( isComplexType(dtype) ){
        if( 2*subdims[1].bwidth > 32 ){
            return false;
        }
    }

    // check dimensions
    if( subdims[1].bwidth > 16 / divider1 ||
        subdims[1].x > 1 ||
        subdims[1].y > 16 / divider1 ){
        return false;
    }

    if( subdims[0].bwidth > 128 ||
        subdims[0].x > 1 ||
        subdims[0].y > 128 ){
        return false;
    }

    if (64 != (subdims[0].y / subdims[1].y) *
              (subdims[0].bwidth / subdims[1].bwidth)) {
        return false;
    }

    if (subdims[0].y > subdims[0].bwidth &&
        subdims[0].y / subdims[0].bwidth <
            (subdims[0].bwidth / subdims[1].bwidth)) {
        return false;
    }

    // the passed PGranularity should be checked
    if( PGRAN_CHECK == check ){
        if( pgran->wgSize[0] * pgran->wgSize[1] != 64 ){
            return false;
        }
    }
    // the PGranularity should be calculated
    else{
        pgran->wgDim = 1;
        pgran->wgSize[1] = 1;
        pgran->wgSize[0] = 64;

        //subdims[0].bwidth = (pgran->wgSize[0] * subdims[1].bwidth) /
        //    (subdims[0].y / subdims[1].y);
    }

    /* Debug out for Tune */
    return true;
}
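/*
 * The work-group identity enforced above, in a standalone sketch
 * (illustration only; the sample numbers are assumptions): the product
 * (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth)
 * must equal the fixed 64-item work group.
 */
#include <stdio.h>

int main(void)
{
    unsigned long y0 = 32, y1 = 4;      /* assumed group/subgroup heights */
    unsigned long bw0 = 32, bw1 = 4;    /* assumed group/subgroup bwidths */

    /* (32 / 4) * (32 / 4) = 8 * 8 = 64 -> satisfies the 64-item constraint */
    printf("items = %lu\n", (y0 / y1) * (bw0 / bw1));
    return 0;
}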
int
generateImageCopyFuncs(
    CopyImgFuncs *copyFuncs,
    struct KgenContext *ctx,
    BlasFunctionID funcID,
    const BlasGenSettings *gset)
{
    const SubproblemDim *dims = gset->subdims;
    KernelExtraFlags kflags = gset->kextra->flags;
    DataType dtype = gset->kextra->dtype;
    const PGranularity *pgran = gset->pgran;
    CopyPattern pattern;
    // mandatory flags for global to local copying
    DBlockCopyFlags glcpFlags[2] = {0, 0};
    struct KgenGuard *guard;
    unsigned int tsize;
    int ret = 0;
    bool isTra, areTails, isConjA;
    bool customize;

    if (kflags & KEXTRA_NO_COPY_VEC_A) {
        glcpFlags[0] = DBLOCK_COPY_NOT_VECTORIZE;
    }
    if (kflags & KEXTRA_NO_COPY_VEC_B) {
        glcpFlags[1] = DBLOCK_COPY_NOT_VECTORIZE;
    }

    tsize = dtypeSize(dtype);
    isTra = isMatrixAccessColMaj(funcID, kflags, MATRIX_A);
    isConjA = isMatrixConj(kflags, MATRIX_A);
    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
    customize = (funcID == CLBLAS_TRMM);

    guard = createKgenGuard(ctx, cpyImgGenCallback, sizeof(CopyPattern));
    if (guard == NULL) {
        return -ENOMEM;
    }

    memset(&pattern, 0, sizeof(pattern));

    pattern.zeroing = false;
    pattern.dim = dims[0];
    pattern.dir = DBLOCK_GLOBAL_TO_IMAGE;
    pattern.dtype = dtype;
    pattern.flags = 0;
    pattern.generic = false;
    pattern.pgran = pgran;

    if (!(customize && (isTra || isConjA))) {
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[0].y;
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToImage[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[0].x;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToImage[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[1].y;
    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->localToImage[MATRIX_A],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[1].x;
    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->localToImage[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // Global to local optimized
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    if (customize || isComplexType(dtype)) {
        pattern.flags = (!customize || isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
        pattern.flags |= glcpFlags[0];
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[1].y;
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocal[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }
    if ((funcID == CLBLAS_GEMM) && isComplexType(dtype)) {
        pattern.flags = DBLOCK_COPY_CONJUGATE | glcpFlags[1];
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[1].x;
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocal[MATRIX_B],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // Global to local generic
    pattern.dim = dims[0];
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    pattern.generic = true;
    if (!customize || areTails) {
        pattern.flags = (isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
        pattern.flags |= glcpFlags[0];
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocalGeneric[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    pattern.flags = (kflags & KEXTRA_CONJUGATE_B) ? DBLOCK_COPY_CONJUGATE : 0;
    pattern.flags |= glcpFlags[1];
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalGeneric[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // Global to local transposed functions
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    pattern.flags = (kflags & KEXTRA_NO_COPY_VEC_A) ?
                    DBLOCK_COPY_NOT_VECTORIZE : 0;
    pattern.flags |= glcpFlags[0];
    if (!customize || isTra) {
        pattern.generic = false;
        if (isConjA) {
            pattern.flags |= DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
        }
        else {
            pattern.flags |= DBLOCK_COPY_TRANSPOSE;
        }
        pattern.dim.x = dims[1].y;
        pattern.dim.y = dims[0].bwidth;
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocalTransposed[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }
    if (!customize || (isTra && areTails)) {
        pattern.generic = true;
        pattern.dim.x = 0;
        pattern.dim.y = 0;
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocalTransposedGeneric[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    pattern.generic = false;
    pattern.dim.x = dims[1].x;
    pattern.dim.y = dims[0].bwidth;
    if (kflags & KEXTRA_CONJUGATE_B) {
        pattern.flags = DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
    }
    else {
        pattern.flags = DBLOCK_COPY_TRANSPOSE;
    }
    pattern.flags |= glcpFlags[1];
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposed[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.generic = true;
    pattern.dim.x = 0;
    pattern.dim.y = 0;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposedGeneric[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // generate two local zeroing functions for matrix A and matrix B blocks
    pattern.zeroing = true;
    pattern.dim = dims[0];
    pattern.generic = false;
    pattern.flags = 0;
    pattern.dim.y = 1;
    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].y;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->zeroBlock[MATRIX_A],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].x;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->zeroBlock[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    ret = kgenAddBlankLine(ctx);

    destroyKgenGuard(guard);

    return ret;
}
cl_int
run(
    const char *ker,
    cl_uint M,
    cl_uint N,
    cl_uint K,
    FType alpha,
    BlasGenSettings *gset,
    TileMulFlags flags,
    cl_device_type deviceType,
    bool verbose,
    unsigned int iterNum)
{
    cl_int err;
    cl_platform_id platform;
    cl_context ctx;
    cl_device_id device;
    cl_command_queue queue;
    cl_event evt;
    DataType dtype = gset->kextra->dtype;
    cl_mem bufA, bufB, bufC;
    FPtr A, B, C, C_naive;
    bool isComplex = isComplexType(dtype);
    bool isDouble = isDoubleBasedType(dtype);
    cl_uint nwords = (isComplex) ? 2 : 1;
    unsigned int tsize = dtypeSize(dtype);
    cl_kernel kernel;
    size_t i, j, k;
    size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    char log[100000];
    size_t logSize;
    cl_long sTime, fTime;
    cl_program program = NULL;

    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, deviceType, 1, &device, NULL);
    ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        return err;
    }
    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (err != CL_SUCCESS) {
        return err;
    }

    /* Prepare the OpenCL kernel and its arguments */

    program = clCreateProgramWithSource(ctx, 1, &ker, NULL, NULL);
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          sizeof(log), log, &logSize);
    printf("%s", log);
    if (err != CL_SUCCESS) {
        clReleaseProgram(program);
        return err;
    }
    kernel = clCreateKernel(program, kernelName, &err);
    if (err != CL_SUCCESS) {
        clReleaseProgram(program);
        return err;
    }

    /* Memory allocation */

    A.v = malloc(M * K * tsize);
    B.v = malloc(K * N * tsize);
    C.v = malloc(M * N * tsize);
    C_naive.v = malloc(M * N * tsize);

#if JUST_MULTIPLICATION
    srand(0);
    if (isDouble) {
        for (i = 0; i < M * K * nwords; i++) {
            A.d[i] = i;
        }
        for (i = 0; i < N * K * nwords; i++) {
            B.d[i] = i + 7;
        }
        for (i = 0; i < M * N * nwords; i++) {
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for (i = 0; i < M * K * nwords; i++) {
            A.f[i] = i;
        }
        for (i = 0; i < N * K * nwords; i++) {
            B.f[i] = i + 7;
        }
        for (i = 0; i < M * N * nwords; i++) {
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }
#else
    srand(0);
    if (isDouble) {
        for (i = 0; i < M * K * nwords; i++) {
            A.d[i] = (double)(rand() % RAND_BOUND);
        }
        for (i = 0; i < N * K * nwords; i++) {
            B.d[i] = (double)(rand() % RAND_BOUND);
        }
        for (i = 0; i < M * N * nwords; i++) {
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for (i = 0; i < M * K * nwords; i++) {
            A.f[i] = (float)(rand() % RAND_BOUND);
        }
        for (i = 0; i < N * K * nwords; i++) {
            B.f[i] = (float)(rand() % RAND_BOUND);
        }
        for (i = 0; i < M * N * nwords; i++) {
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }
#endif

    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                          K * M * tsize, A.v, &err);
    if (err != CL_SUCCESS) {
        clReleaseKernel(kernel);
        return err;
    }
    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                          K * N * tsize, B.v, &err);
    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }
    bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                          M * N * tsize, C.v, &err);
    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    /* Argument setting and kernel execution */

    err = clSetKernelArg(kernel, 0, tsize, alpha.u);
    err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA);
    err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB);
    err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
    err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
    err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
    err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
    err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum);
    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufC);
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
                                 globalWorkSize, localWorkSize,
                                 0, NULL, &evt);
    if (err != CL_SUCCESS) {
        clReleaseMemObject(bufC);
        clReleaseMemObject(bufB);
        clReleaseMemObject(bufA);
        clReleaseKernel(kernel);
        return err;
    }

    err = clFinish(queue);
    err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * tsize, C.v,
                              0, NULL, NULL);

    /* Naive CPU multiplication */
    if (isDouble) {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_double2 val;

                    for (k = 0; k < K; k++) {
                        cl_double2 bkj = flags & TILEMUL_TRB ?
                            B.d2[j * K + k] : B.d2[k * N + j];
                        cl_double2 aik = flags & TILEMUL_TRA ?
                            A.d2[k * M + i] : A.d2[i * K + k];

                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.d2[i * N + j].s[0] += val.s[0];
                        C_naive.d2[i * N + j].s[1] += val.s[1];
                    }
                    val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] -
                               C_naive.d2[i * N + j].s[1] * alpha.d2.s[1];
                    val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] +
                               C_naive.d2[i * N + j].s[1] * alpha.d2.s[0];
                    C_naive.d2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        double bkj = flags & TILEMUL_TRB ?
                            B.d[j * K + k] : B.d[k * N + j];
                        double aik = flags & TILEMUL_TRA ?
                            A.d[k * M + i] : A.d[i * K + k];

                        C_naive.d[i * N + j] += aik * bkj;
                    }
                    C_naive.d[i * N + j] *= alpha.d;
                }
            }
        }
        for (i = 0; i < M * N; i++) {
            if (C.d[i] != C_naive.d[i]) {
                printf("Differ at (%lu, %lu): %lf != %lf\n",
                       i / N, i % N, C.d[i], C_naive.d[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }
    else {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_float2 val;

                    for (k = 0; k < K; k++) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                            B.f2[j * K + k] : B.f2[k * N + j];
                        cl_float2 aik = flags & TILEMUL_TRA ?
                            A.f2[k * M + i] : A.f2[i * K + k];

                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.f2[i * N + j].s[0] += val.s[0];
                        C_naive.f2[i * N + j].s[1] += val.s[1];
                    }
                    val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] -
                               C_naive.f2[i * N + j].s[1] * alpha.f2.s[1];
                    val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] +
                               C_naive.f2[i * N + j].s[1] * alpha.f2.s[0];
                    C_naive.f2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        float bkj = flags & TILEMUL_TRB ?
                            B.f[j * K + k] : B.f[k * N + j];
                        float aik = flags & TILEMUL_TRA ?
                            A.f[k * M + i] : A.f[i * K + k];

                        C_naive.f[i * N + j] += aik * bkj;
                    }
                    C_naive.f[i * N + j] *= alpha.f;
                }
            }
        }
        for (i = 0; i < M * N; i++) {
            if (C.f[i] != C_naive.f[i]) {
                printf("Differ at (%lu, %lu): %lf != %lf\n",
                       i / N, i % N, C.f[i], C_naive.f[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }
    /* End of naive CPU multiplication */

    if (verbose) {
        if (!isDouble) {
            printf("Matrix A:\n");
            for (i = 0; i < M; i++) {
                for (k = 0; k < K; k++) {
                    if (isComplex) {
                        cl_float2 aik = flags & TILEMUL_TRA ?
                            A.f2[k * M + i] : A.f2[i * K + k];

                        printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]);
                    }
                    else {
                        float aik = flags & TILEMUL_TRA ?
                            A.f[k * M + i] : A.f[i * K + k];

                        printf("%4.1f ", aik);
                    }
                }
                printf("\n");
            }
            printf("Matrix B:\n");
            for (k = 0; k < K; k++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                            B.f2[j * K + k] : B.f2[k * N + j];

                        printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]);
                    }
                    else {
                        float bkj = flags & TILEMUL_TRB ?
                            B.f[j * K + k] : B.f[k * N + j];

                        printf("%4.1f ", bkj);
                    }
                }
                printf("\n");
            }
            printf("CPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                               C_naive.f2[i * N + j].s[0],
                               C_naive.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C_naive.f[i * N + j]);
                    }
                }
                printf("\n");
            }
            printf("GPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                               C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C.f[i * N + j]);
                    }
                }
                printf("\n");
            }
        }
    }

    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,
                            sizeof(cl_ulong), &sTime, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,
                            sizeof(cl_ulong), &fTime, NULL);
    printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
           (int)((fTime - sTime) / 1000000),
           (int)((fTime - sTime) / iterNum));

    /* release remaining OpenCL objects and host buffers */
    clReleaseEvent(evt);
    clReleaseMemObject(bufC);
    clReleaseMemObject(bufB);
    clReleaseMemObject(bufA);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    free(A.v);
    free(B.v);
    free(C.v);
    free(C_naive.v);

    return CL_SUCCESS;
}
//-----------------------------------------------------------------------------
// TODO: reimplement via the new validation API
static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    unsigned int subgA = 0;
    unsigned int subgB = 0;
    unsigned int regUse = 0;
    unsigned int itemsPerSubg = 0;

    DUMMY_ARG_USAGE(subdimsNum);

    if( 0 == subdims[0].x ||
        0 == subdims[0].y ||
        0 == subdims[0].bwidth ||
        0 == subdims[1].x ||
        0 == subdims[1].y ||
        0 == subdims[1].bwidth ){
        return false;
    }

    subgA = subdims[0].y / subdims[1].y;
    subgB = subdims[0].x / subdims[1].x;
    itemsPerSubg = subdims[0].bwidth / subdims[1].bwidth;

    if( itemsPerSubg < 4 ){
        return false;
    }

    if( subdims[1].y < 4 ||
        subdims[1].x < 4 ||
        subdims[1].bwidth < 4 ){
        return false;
    }

    if( subdims[1].x != subdims[1].itemX ||
        subdims[1].y != subdims[1].itemY ){
        return false;
    }

    // the group block must consist of an integer number of subgroup blocks
    if( subdims[0].x % subdims[1].x ||
        subdims[0].y % subdims[1].y ||
        subdims[0].bwidth % subdims[1].bwidth ){
        return false;
    }

    // check fitting of bw to common vector sizes
    if( isComplexType(dtype) ){
        if( 2*subdims[1].bwidth > 16 ){
            return false;
        }
    }

    // check dimensions
    if( subdims[1].bwidth > 16 ||
        subdims[1].x > 16 ||
        subdims[1].y > 16 ){
        return false;
    }

    // estimate register usage and drop inevitably slow decompositions
    regUse = (subdims[1].bwidth * subdims[1].x +
              subdims[1].bwidth * subdims[1].y +
              subdims[1].x * subdims[1].y) *
             dtypeSize(dtype);
    regUse /= 16;   // 16 bytes per register

    if( regUse >= 64 ){
        return false;
    }

    // the passed PGranularity should be checked
    if( PGRAN_CHECK == check ){
        if( pgran->wgDim != 1 ){
            return false;
        }
        if( pgran->wgSize[0] != 64 ){
            return false;
        }
        if( pgran->wgSize[0] != subgA*subgB*itemsPerSubg ){
            return false;
        }
    }
    // the PGranularity should be calculated
    else{
        pgran->wgDim = 1;
        pgran->wgSize[0] = subgA * subgB * itemsPerSubg;
    }

    return true;
}
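/*
 * The register-pressure estimate above, as a standalone sketch (illustration
 * only; the tile shapes are assumed sample values): a subgroup holds a
 * bwidth x x tile of B, a bwidth x y tile of A and an x x y tile of C, at
 * 16 bytes per register.
 */
#include <stdio.h>

static unsigned int estimateRegUse(unsigned int bw, unsigned int x,
                                   unsigned int y, unsigned int tsize)
{
    return (bw * x + bw * y + x * y) * tsize / 16;
}

int main(void)
{
    /* subdims[1] = {bwidth = 4, x = 8, y = 8}, float elements:
     * (32 + 32 + 64) * 4 / 16 = 32 registers, under the 64-register cap */
    printf("regUse = %u\n", estimateRegUse(4, 8, 8, 4));
    return 0;
}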