Ejemplo n.º 1
0
/*
 * isFitToLDS()
 *
 * "dim[0].y" is treated as the TRIANGLE_HEIGHT, i.e. the number of
 * variables solved by the corresponding TRTRI kernel.
 *
 * NOTE:
 * 1. This function may cause "dim[0].y" to end up different from the value
 *    used by the "trtri" counterpart. That case is detected in "xtrsv.c"
 *    and the TRSV call is aborted there.
 * 2. The sizes estimated below are deliberately pessimistic and may need
 *    to be tightened later.
 */
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    const CLBlasKargs *blasArgs = (const CLBlasKargs *)kernelArgs;
    size_t blockSize = 256;
    cl_ulong maxSize;
    bool gemvLike;

    /* GEMV-style access: column-major non-transposed, or row-major
       transposed */
    gemvLike = ((blasArgs->transA == clblasNoTrans) &&
                (blasArgs->order == clblasColumnMajor)) ||
               ((blasArgs->transA != clblasNoTrans) &&
                (blasArgs->order == clblasRowMajor));

    if (gemvLike) {
        /*
         * Estimate worst-case local memory: assume a vector width of 4
         * irrespective of the data type. Halve the block size until a
         * non-zero target width is found (or the block size reaches 1).
         */
        cl_ulong tw = getTargetWidth(dim[0].y, blockSize, 4);

        while ((tw == 0) && (blockSize > 1)) {
            blockSize /= 2;
            tw = getTargetWidth(dim[0].y, blockSize, 4);
        }
        #ifdef DEBUG_TRSV_GEMV
        printf("TRSV GEMV: isFitLDS() tw = %lu\n", tw);
        #endif
        maxSize = (1 + 4 + tw) * dtypeSize(dtype) +
                  blockSize * dtypeSize(dtype) * 4;
        #ifdef DEBUG_TRSV_GEMV
        printf("TRSV GEMV: isFitLDS() maxSize = %lu, ldsSize = %lu, Y = %lu\n", maxSize, ldsSize, dim[0].y);
        #endif
        return (maxSize < ldsSize);
    }

    /*
     * The remaining kernels keep "TriangleWidth" worth of the RHS in local
     * memory; "dim[0].y" is assumed to be that "TriangleWidth".
     */
    blockSize = (dim[0].y * dim[0].y > 256) ? 256 : dim[0].y * dim[0].y;
    maxSize = (dim[0].y + blockSize) * dtypeSize(dtype);
    return (maxSize < ldsSize);
}
Ejemplo n.º 2
0
/*
 * Validate a strided vector argument: non-zero length and increment, a
 * readable cl_mem, and enough bytes behind it to cover every element the
 * stride touches. 'err' selects whether X- or Y-flavored error codes are
 * returned.
 */
clblasStatus VISIBILITY_HIDDEN
checkVectorSizes(
    DataType dtype,
    size_t N,
    cl_mem x,
    size_t offx,
    int incx,
    ErrorCodeSet err )
{
    size_t elemSize, spanBytes, offBytes, memSize;

    /* an empty vector is not a valid BLAS argument */
    if (N == 0) {
        return clblasInvalidDim;
    }

    /* a zero increment would make every element alias the first one */
    if (incx == 0) {
        if (err == X_VEC_ERRSET) {
            return clblasInvalidIncX;
        }
        if (err == Y_VEC_ERRSET) {
            return clblasInvalidIncY;
        }
        return clblasNotImplemented;
    }

    if (clGetMemObjectInfo(x, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
            CL_SUCCESS) {
        if (err == X_VEC_ERRSET) {
            return clblasInvalidVecX;
        }
        if (err == Y_VEC_ERRSET) {
            return clblasInvalidVecY;
        }
        return clblasNotImplemented;
    }

    /* number of bytes the strided vector actually spans */
    elemSize = dtypeSize(dtype);
    spanBytes = ((N - 1) * abs(incx) + 1) * elemSize;
    offBytes = offx * elemSize;

    /* the second comparison catches size_t wrap-around of the sum */
    if ((offBytes + spanBytes > memSize) ||
        (offBytes + spanBytes < offBytes)) {
        if (err == X_VEC_ERRSET) {
            return clblasInsufficientMemVecX;
        }
        if (err == Y_VEC_ERRSET) {
            return clblasInsufficientMemVecY;
        }
        return clblasNotImplemented;
    }

    return clblasSuccess;
}
Ejemplo n.º 3
0
/*
 * Check whether the matrix C block described by dim[1] fits into the
 * available local memory.
 */
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs;
    /* number of elements in the C block, scaled to bytes below */
    cl_ulong nrElems = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side);

    return (nrElems * dtypeSize(dtype) <= ldsSize);
}
Ejemplo n.º 4
0
/*
 * Generate a data block copying function for the given direction and
 * flags. Returns 0 on success, or -EINVAL when an unsupported combination
 * is requested (non-float4-aligned rows or transposition for image
 * destinations).
 */
int
copyDataBlockGen(
    struct KgenContext *ctx,
    const SubproblemDim *dim,
    const PGranularity *pgran,
    DataType dtype,
    DBlockCopyDirection dir,
    DBlockCopyFlags flags)
{
    GenPriv gpriv;
    unsigned int tsize = dtypeSize(dtype);
    bool toImage = (dir == DBLOCK_LOCAL_TO_IMAGE) ||
                   (dir == DBLOCK_GLOBAL_TO_IMAGE);

    if (toImage) {
        /* only float4 aligned rows are supported for image targets */
        if ((dim != NULL) && ((tsize * dim->x) % sizeof(cl_float4) != 0)) {
            return -EINVAL;
        }
        if (flags & DBLOCK_COPY_TRANSPOSE) {
            return -EINVAL;
        }
    }

    memset(&gpriv, 0, sizeof(gpriv));
    gpriv.transp = (flags & DBLOCK_COPY_TRANSPOSE);
    gpriv.packed = (flags & DBLOCK_COPY_PACKED_IMAGE);
    if (dtype != TYPE_COMPLEX_DOUBLE) {
        gpriv.notVectorize = (flags & DBLOCK_COPY_NOT_VECTORIZE);
    }
    /* conjugation only makes sense for complex data */
    if ((flags & DBLOCK_COPY_CONJUGATE) && isComplexType(dtype)) {
        gpriv.conjugate = true;
    }
    initGenPriv(&gpriv, dtype, tsize, dim ,dir, NULL, pgran);

    /* a known dimension enables the optimized generator */
    return (dim != NULL) ? copyDBlockOptimGen(ctx, dim, pgran, &gpriv)
                         : copyDBlockGenericGen(ctx, pgran, &gpriv);
}
Ejemplo n.º 5
0
/*
 * Generate local-memory zeroing functions for the matrix blocks selected
 * by 'flags'. Identical dimensions reuse a previously generated function
 * name instead of emitting a duplicate. Returns 0 on success or the first
 * generator error.
 */
int
generateZeroingFuncs(
    ZeroFuncs *funcNames,
    struct KgenContext *ctx,
    const SubproblemDim *blasDim,
    const PGranularity *pgran,
    DataType dtype,
    ZeroGenHelperFlags flags)
{
    int ret = 0;
    SubproblemDim dim[MATRIX_ROLES_NUMBER];
    size_t tsize, nvecs;
    unsigned int role, match;

    tsize = dtypeSize(dtype);
    nvecs = fl4RowWidth(blasDim->bwidth, tsize);

    /* A and B blocks share the bwidth-based row width, C uses x-based */
    checkInitSubdim(&dim[MATRIX_A], flags, ZF_MATRIX_A, nvecs * blasDim->y, 1);
    checkInitSubdim(&dim[MATRIX_B], flags, ZF_MATRIX_B, nvecs * blasDim->x, 1);
    nvecs = fl4RowWidth(blasDim->x, tsize);
    checkInitSubdim(&dim[MATRIX_C], flags, ZF_MATRIX_C, nvecs * blasDim->y, 1);

    for (role = 0; role < MATRIX_ROLES_NUMBER; role++) {
        if (dim[role].x == SUBDIM_UNUSED) {
            continue;
        }

        /* reuse the name of an already generated identical function */
        match = lookupDim(dim, role);
        if (match != IDX_INVAL) {
            strcpy(funcNames->names[role], funcNames->names[match]);
            continue;
        }

        ret = f4zeroBlockGen(ctx, &dim[role], pgran, "__local");
        if (!ret) {
            kgenGetLastFuncName(funcNames->names[role], FUNC_NAME_MAXLEN,
                                ctx);
        }
        kgenAddBlankLine(ctx);
        if (ret) {
            break;
        }
    }

    return ret;
}
Ejemplo n.º 6
0
/*
 * Conservative local-memory fit check. The kernel really needs
 * y1 * wgSize elements, but y1 is not known yet; the product below is a
 * reliable upper bound (larger by a factor of dims[1].bwidth).
 */
static bool
isFitToLDS(
    SubproblemDim *dim,
    DataType dtype,
    cl_ulong ldsSize,
    const void *kernelArgs)
{
    cl_ulong required;

    (void)kernelArgs;

    required = dim[0].y * dim[0].bwidth * dtypeSize(dtype);

    return (required <= ldsSize);
}
Ejemplo n.º 7
0
/*
 * checkBandedMatrixSizes()
 *
 * Validate the dimensions, leading dimension, and backing cl_mem size of
 * a banded matrix with KL sub-diagonals and KU super-diagonals (packed
 * band height K = KL + KU + 1). 'err' selects whether A-, B- or C-flavored
 * error codes are returned.
 *
 * Returns clblasSuccess when the description is consistent and the memory
 * object is large enough, or the matching clblasInvalid* /
 * clblasInsufficientMem* code otherwise.
 *
 * Changes vs. previous revision:
 *  - removed 'unusedTail', which was computed but never read (and whose
 *    (lda - N) / (lda - M) expressions could underflow, since lda is only
 *    validated against K);
 *  - added a wrap-around guard on 'offA + matrSize', mirroring
 *    checkVectorSizes().
 */
clblasStatus VISIBILITY_HIDDEN
checkBandedMatrixSizes(
    DataType dtype,
    clblasOrder order,
    clblasTranspose transA,
    size_t M,
    size_t N,
    size_t KL,
    size_t KU,
    cl_mem A,
    size_t offA,
    size_t lda,
    ErrorCodeSet err )
{
    size_t memSize, matrSize, tsize, K, memUsed;
    bool tra;

    if ((M == 0) || (N == 0)) {
        return clblasInvalidDim;
    }

    tsize = dtypeSize(dtype);
    K = KL + KU + 1;
    /* tra: the band is laid out along N (true) or along M (false) */
    tra = (order == clblasRowMajor && transA != clblasNoTrans) ||
          (order == clblasColumnMajor && transA == clblasNoTrans);

    /* the leading dimension must cover the full band height */
    if (lda < K) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInvalidLeadDimA;
        case B_MAT_ERRSET:
            return clblasInvalidLeadDimB;
        case C_MAT_ERRSET:
            return clblasInvalidLeadDimC;
        default:
            return clblasNotImplemented;
        }
    }

    /*
     * Size in bytes of the banded matrix. The last column (row) needs only
     * K elements, so the tail between K and lda there is not counted.
     */
    if (tra) {
        matrSize = ((N - 1) * lda + K) * tsize;
    }
    else {
        matrSize = ((M - 1) * lda + K) * tsize;
    }

    offA *= tsize;

    if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) !=
            CL_SUCCESS) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInvalidMatA;
        case B_MAT_ERRSET:
            return clblasInvalidMatB;
        case C_MAT_ERRSET:
            return clblasInvalidMatC;
        default:
            return clblasNotImplemented;
        }
    }

    /*
     * Memory required. The second comparison catches size_t wrap-around
     * of the sum, which would otherwise make an oversized request pass.
     */
    memUsed = offA + matrSize;
    if ((memUsed > memSize) || (memUsed < offA)) {
        switch( err )
        {
        case A_MAT_ERRSET:
            return clblasInsufficientMemMatA;
        case B_MAT_ERRSET:
            return clblasInsufficientMemMatB;
        case C_MAT_ERRSET:
            return clblasInsufficientMemMatC;
        default:
            return clblasNotImplemented;
        }
    }

    return clblasSuccess;
}
Ejemplo n.º 8
0
/*
 * Store a scalar value (e.g. alpha or beta) into a kernel argument slot:
 * records the element size for the data type and copies that many bytes
 * of 'value' into the argument's inline data buffer.
 */
void VISIBILITY_HIDDEN
assignScalarKarg(KernelArg *arg, const void *value, DataType dtype)
{
    size_t size = dtypeSize(dtype);

    arg->typeSize = size;
    memcpy(arg->arg.data, value, size);
}
Ejemplo n.º 9
0
/*
 * Emit the local-variable declaration preamble of a generated TRSM kernel
 * into 'ctx', and describe the local (LDS) B-tile in '*parTile'.
 *
 * '*parTile' starts as a copy of gset->tileBX and, when LDS is in use
 * (extraParams->ldsUse), is resized/reshaped to hold several stage-1 tiles
 * (per extraParams->unrollingFactor) and one stage-2 tile.
 */
static void
declareLocalVariables(
    struct KgenContext *ctx,
    const BlasGenSettings *gset,
    Tile* parTile,
    TrsmExtraParams * extraParams)
{
    char tmp[1024];
    const SubproblemDim *dims = gset->subdims;
    const char* parTileTypeName = NULL;
    bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags,
                                   MATRIX_B);
    unsigned int locWidth;
    unsigned int tsize;
    unsigned int parTileSize;
    unsigned int l1Pans;
    unsigned int step;

    kgenAddStmt(ctx,
                 "const int lid = get_local_id(0);\n"
                 "const int gid = get_group_id(0);\n"
                 "GPtr uA, uB;\n"
                 "uint coordA, coordB;\n"
                 "uint m0 = 0, k0, m1;\n");

    // upper-triangular kernels iterate from the last full row block down
    if (isMatrixUpper(gset->kextra->flags)) {
        sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n",
                dims[0].y, dims[0].y);
        kgenAddStmt(ctx, tmp);
    }

    /*
     * Declare private blocks.
     * The region 'b' stores in different time tiles of both
     * the input matrices and the result
     */

    declareTileStorages(ctx, gset);

    *parTile = gset->tileBX;

    if (extraParams->ldsUse) {
        tsize = dtypeSize(gset->kextra->dtype);
        l1Pans = (unsigned int)(dims[0].x / dims[1].x);

        // vector length is capped at one float4 worth of elements
        parTile->vecLen = (trb) ? (unsigned int)dims[1].x
                                : (unsigned int)dims[1].bwidth;
        parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize);
        parTile->trans = trb;

       /*
        * Allocate enough space in the local area to fit several tiles
        * at the stage1 (according to the unrolled factor) and one tile
        * at the stage2
        */

        locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor;
        if (extraParams->ldsUse & LDS_USE_DIAGONAL) {
            locWidth = umax(locWidth, (unsigned int)dims[1].y);
        }
        // local tile shape depends on whether B is accessed transposed
        if (trb) {
            parTile->nrRows = locWidth;
            parTile->nrCols = (unsigned int)dims[0].x;
            step = (unsigned int)dims[1].x / parTile->vecLen;
        }
        else {
            parTile->nrRows = (unsigned int)dims[0].x;
            parTile->nrCols = locWidth;
            step = (unsigned int)dims[1].x * locWidth / parTile->vecLen;
        }

        parTileSize = tileVectorsNum(parTile);

        getVectorTypeName(gset->kextra->dtype, parTile->vecLen,
                          &parTileTypeName, NULL);

        // declare the LDS buffer plus per-panel pointers into it
        sprintf(tmp, "__local %s tmpB[%i];\n"
                     "LPtr lB;\n"
                     "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n",
                parTileTypeName, parTileSize, l1Pans, step);
        kgenAddStmt(ctx, tmp);

        if (useSkewedFetchB(gset)) {
            kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n",
                       l1Pans, gset->subdims[1].x);
        }
    }

    kgenAddBlankLine(ctx);
}
Ejemplo n.º 10
0
/*
 * Post-process TRSM kernel arguments: choose the global-to-local load
 * unrolling factor and the LDS usage mode, and store both (plus the
 * unrolled tail size) into the solver's TrsmExtraParams.
 *
 * Fix vs. previous revision: the unconditional `ldsUse = LDS_NO_USE;`
 * reset after the debug block discarded the ldsUse digit parsed from the
 * AMD_CLBLAS_TRSM_LDSUSE environment variable, making that half of the
 * debug knob dead. The override is now applied AFTER the default policy
 * is computed, so it actually takes effect; release (NDEBUG) behavior is
 * unchanged.
 */
static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra)
{
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CLBlasKargs *kargs = (CLBlasKargs*)args;
    TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv;
    size_t loadBatch;
    unsigned int wgSize;
    unsigned int workRatio;
    unsigned int ldsUse = LDS_NO_USE;
    KernelExtraFlags kflags = kextra->flags;
    SubproblemDim globDim;
    bool isAmdGPU;

    /*
     * Calculate size of the batch loaded from global to local memory
     * at each iteration of the stage 1. Choose such unrolling factor
     * that allow each work item to load at least 16 bytes that provides
     * efficient global memory access
     */
    loadBatch = subdims[0].x * subdims[1].bwidth * dtypeSize(kargs->dtype);
    wgSize = (unsigned int)((subdims[0].x / subdims[1].itemX) *
                            (subdims[0].y / subdims[1].itemY));
    if (loadBatch < wgSize) {
        workRatio = 1;
    }
    else {
        workRatio = 16 / ((unsigned int)loadBatch / wgSize);
        if (!workRatio) {
            workRatio = 1;
        }
    }

    /*
     * Default LDS policy: large use is safe on AMD GPUs without lower
     * K/M tails, and on other vendors without an M tail.
     */
    isAmdGPU = ((kflags & KEXTRA_VENDOR_AMD) != 0);
    if ((isAmdGPU && !(kflags & (KEXTRA_TAILS_K_LOWER | KEXTRA_TAILS_M_LOWER)))
        || (!isAmdGPU && !(kflags & KEXTRA_TAILS_M))) {

        ldsUse = LDS_USE_LARGE;
    }

#ifndef NDEBUG
    /*
     * Debug override: AMD_CLBLAS_TRSM_LDSUSE is parsed as decimal
     * "<workRatio><ldsUse>" — last digit selects ldsUse, the remaining
     * digits (if non-zero) replace workRatio.
     */
    {
        const char *envImpl = getenv("AMD_CLBLAS_TRSM_LDSUSE");

        if (envImpl != NULL) {
            unsigned int w = atoi(envImpl);
            ldsUse = w % 10;
            w = w / 10;
            workRatio = w > 0 ? w : workRatio;
        }
    }
#endif

    kargsToProbDims(&globDim, CLBLAS_TRSM, args, false);
    extraParams->ldsUse = ldsUse;
    extraParams->unrollingFactor = workRatio;
    /* number of subdims[1].bwidth steps left over after full unrolled
       iterations over the global K extent */
    extraParams->unrolledTail = (unsigned int)(((globDim.bwidth %
             (subdims[1].bwidth * workRatio)) + subdims[1].bwidth - 1) /
                                                subdims[1].bwidth);

    fixupTrxmKargs(kargs);
}
Ejemplo n.º 11
0
// Image-based GEMM kernel generator: matrices A and B are read from
// images (see CLMEM_IMAGE below), C is updated in global memory.
// Returns the required buffer size (source size + 1) when buf is too
// small or on success, -ENOMEM / -EOVERFLOW on failure.
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    char tmp[4096], tmp1[4096];
    char *p;
    // is the iteration over N, N at the top level
    const char *typeName;
    char fpref;
    DataType dtype = kextra->dtype;
    ssize_t ret;
    BlasGenSettings gset;
    BlkMulOpts mulOpts;
    unsigned int tsize;
    unsigned int vecLen, outVecLen;
    bool b;
    const char *outTypeName;
    unsigned int i;
    unsigned int nrRegs, regPitch;
    int tra, trb;
    // index 0 -> 'y' (column-major access), index 1 -> 'x'
    char vect[2] = {'y', 'x'};

    // per-work-item coordinates and vectorized K extent
    const char *coordConstants =
        "const uint workItemM = get_global_id(0) * %lu;\n"
        "const uint workItemN = get_global_id(1) * %lu;\n"
        "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n"
        "uint vectK = (K + %u) / %u;\n";

    /*
     *  template for image based gemm preparation part
     *  for two dimensional work space
     */
    const char *localVariables =
        "uint k0;\n"
        "int2 coordA = (int2)(0, workItemM);\n"
        "int2 coordB = (int2)(0, workItemN);\n"
        "%s c[%u];\n\n";

    tsize = dtypeSize(dtype);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    // register pitch: complex rows are kept as-is, real rows are rounded
    // up to a whole number of float4 vectors
    if (isComplexType(dtype)) {
        regPitch = (unsigned int)subdims[1].x;
    }
    else {
        regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) *
                    sizeof(cl_float4) / tsize;
    }

    memset(&gset, 0, sizeof(gset));
    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;
    initKernelVarNames(&gset.varNames, kextra->flags);

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    // at first, generate needed declarations and auxiliary functions
    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);

    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    // now, generate the kernel

    sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref,
            typeName, typeName, typeName);
    kgenDeclareFunction(ctx, tmp);
    // NOTE(review): this 'ret' is overwritten below without being checked
    ret = kgenBeginFuncBody(ctx);

    // constants
    sprintf(tmp, coordConstants,
            subdims[1].y, subdims[1].x, subdims[1].y,
            vecLen - 1, vecLen);
    kgenAddStmt(ctx, tmp);

    /*
     * Calculate local buffer pitches, and then declare local
     * variables
     */
    getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName);

    sprintf(tmp, localVariables, outTypeName, nrRegs);
    kgenAddStmt(ctx, tmp);

    // check if offset exceeds matrix
    kgenAddStmt(ctx, "if ((workItemM >= M) ||"
                         "(workItemN >= N)) {\n"
                     "    return;\n"
                     "}\n");

    kgenAddStmt(ctx, "C += offsetC;\n");

    // zero C block
    sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n"
                 "    c[k0] = 0;\n"
                 "}\n\n",
            nrRegs);
    kgenAddStmt(ctx, tmp);

    // block multiplication inlined function
    sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)",
            subdims[1].bwidth / vecLen);
    kgenBeginBranch(ctx, tmp);

    mulOpts.aMobj = CLMEM_IMAGE;
    mulOpts.bMobj = CLMEM_IMAGE;
    mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE;
    if (isComplexType(dtype)) {
        mulOpts.core = BLKMUL_SEPARATE_MULADD;
    }
    else {
        mulOpts.core = BLKMUL_MAD;
    }
    mulOpts.argNames.coordA = "coordA";
    mulOpts.argNames.coordB = "coordB";
    mulOpts.argNames.skewCol = "skewCol";
    mulOpts.argNames.skewRow = "skewRow";
    mulOpts.argNames.k = "k0";
    mulOpts.argNames.vectBoundK = "vectK";
    ret = blkMulGen(ctx, subdims, dtype, &mulOpts);
    // NOTE(review): any blkMulGen error is collapsed to -EOVERFLOW here
    if (ret) {
        destroyKgenContext(ctx);
        return -EOVERFLOW;
    }

    // update image coordinates
    sprintf(tmp, "\ncoordA.x += %lu;\n"
                 "coordB.x += %lu;\n",
            subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen);
    kgenAddStmt(ctx, tmp);

    kgenEndBranch(ctx, NULL);

    // reorder the given solution: emit a rotation of the per-row result
    // registers to undo the skew applied during multiplication
    outVecLen = isComplexType(dtype) ? 1 : vecLen;
    p = tmp1;
    for (i = 0; i < regPitch / outVecLen; i++) {
        unsigned int k = (unsigned int)(subdims[1].y - 1) *
                         regPitch / outVecLen + i;

        sprintf(p,  "\n"
                    "    tmp = c[%u];\n"
                    "    for (j = %lu; j >= 0; j--) {\n"
                    "        c[(j+1) * %u + %u] = c[j * %u + %u];\n"
                    "    }\n"
                    "    c[%u] = tmp;\n",
                k, subdims[1].y - 2, regPitch / outVecLen,
                i, regPitch / outVecLen, i, i);
        p += strlen(p);
    }
    sprintf(tmp, "\n"
                 "for (k0 = 0; k0 < skewRow.y; k0++) {\n"
                 "    int j;\n"
                 "    %s tmp;\n"
                 "%s"
                 "}\n"
                 "\n",
                 outTypeName, tmp1);
    kgenAddStmt(ctx, tmp);

    // restore output coordinates; which component holds M/N depends on
    // the column-major access of A/B
    tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A);
    trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B);
    sprintf(tmp, "coordA.%c = workItemM;\n"
                 "coordB.%c = workItemN;\n\n",
            vect[tra], vect[trb]);
    kgenAddStmt(ctx, tmp);

    // write back the tile evaluated
    generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }

    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
Ejemplo n.º 12
0
// Preparation function for images based kernel generator: emits the
// kernel that copies a block of matrix A (CLBLAS_PREP_A_KERNEL) or
// matrix B into the corresponding image, handling transposition,
// conjugation and unaligned tails via a local-memory staging buffer.
// Returns the required buffer size (source size + 1), or -ENOMEM /
// -EOVERFLOW on failure.
static ssize_t
preparator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{
    struct KgenContext *ctx;
    char tmp[4096], conjStr[1024];
    CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra;
    CopyImgFuncs copyImgFuncs;
    DataType dtype = kextra->dtype;
    BlasGenSettings gset;
    unsigned int vecLen;
    unsigned int tsize;
    const char *typeName;
    char fpref;
    bool b;
    size_t localBufSize;
    ssize_t ret;
    const char *conjCond;

    // kernel prologue for matrix A: compute the (m, k) block coordinates
    // of this work group from its id and bail out past the M boundary
    const char *functionHeadA =
        "int tra, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "uint m = (gid / bpr) * %lu;\n"
        "uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "A += offsetA;\n"
        "tra = (!transA && order == clblasColumnMajor) ||\n"
        "      (transA && order == clblasRowMajor);\n"
        "if (m >= M) {\n"
        "     return;\n"
        "}\n";

    // kernel prologue for matrix B, analogous to functionHeadA
    const char *functionHeadB =
        "int trb, aligned;\n"
        "const uint bpr = (K + %lu) / %lu;\n"
        "const uint n = (gid / bpr) * %lu;\n"
        "const uint k = (gid %% bpr) * %lu;\n"
        "uint x, y;\n"
        "__local %s temp[%lu];\n"
        "\n"
        "B += offsetB;\n"
        "trb = (!transB && order == clblasRowMajor) ||\n"
        "      (transB && order == clblasColumnMajor);\n"
        "if (n >= N) {\n"
        "    return;\n"
        "}\n";

    // Distribute blocks across compute units and copy matrix A to image.
    // Transposition and filling with zeros in unaligned cases is made using
    // buffer in local memory.
    const char *copyToImageA =
        "//copy matrix A block\n"
        "y = m + %u <= M ? %u : M - m;\n"
        "x = k + %u <= K ? %u : K - k;\n"
        "aligned = (x == %u) && (y == %u) && %d;\n"
        "int atcase = aligned * 10 + tra;\n"
        "%s" // conjugated check
        "if (atcase != 10) {\n"
        "    %s((__local float4*)temp);\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "}\n"
        "switch(atcase) {\n"
        "case 10: //aligned, not transposed\n"
        "    %s(imgA, k / %u, m, (GPtr)A, m, k, lda);\n"
        "    break;\n"
        "%s" // conjugated case
        "case 1: //not aligned, transposed\n"
        "    // generic transposed global to local\n"
        "    %s((LPtr)temp, (GPtr)A, k, m, x, y, %u, lda);\n"
        "    break;\n"
        "case 0: //not aligned, not transposed\n"
        "    // generic global to local\n"
        "    %s((LPtr) temp, (GPtr)A, m, k, y, x, %u, lda);\n"
        "    break;\n"
        "case 11: //aligned, transposed\n"
        "    // optimized transposed global to local\n"
        "    %s((LPtr) temp, (GPtr)A, k, m, lda);\n"
        "    break;\n"
        "}\n"
        "if (atcase != 10) {\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "    %s(imgA, k / %u, m, (LPtr) temp);\n"
        "}\n"
        "\n";

    // analogous copy template for matrix B
    const char *copyToImageB =
            "//copy matrix B block\n"
            "y = n + %u <= N ? %u : N - n;\n"
            "x = k + %u <= K ? %u : K - k;\n"
            "aligned = (x == %u) && (y == %u) && %d;\n"
            "int atcase = aligned * 10 + trb;\n"
            "%s" // conjugated check
            "if (atcase != 10) {\n"
            "    %s((__local float4*)temp);\n"
            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
            "}\n"
            "switch (atcase) {\n"
            "case 10: //aligned, not transposed\n"
            "    %s(imgB, k / %u, n, (GPtr)B, n, k, ldb);\n"
            "    break;\n"
            "%s" // conjugated case
            "case 1: //not aligned, transposed\n"
            "    // generic transposed global to local\n"
            "    %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n"
            "    break;\n"
            "case 0: //not aligned, not transposed\n"
            "    // generic global to local\n"
            "    %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n"
            "    break;\n"
            "case 11: //transposed, aligned\n"
            "    // optimized transposed global to local\n"
            "    %s((LPtr)temp, (GPtr)B, k, n, ldb);\n"
            "    break;\n"
            "}\n"
            "if (atcase != 10) {\n"
            "    barrier(CLK_LOCAL_MEM_FENCE);\n"
            "    %s(imgB, k / %u, n, (LPtr)temp);\n"
            "}\n"
            "\n";

    memset(&copyImgFuncs, 0, sizeof(copyImgFuncs));
    memset(&gset, 0, sizeof(gset));

    ctx = createKgenContext(buf, buflen, true);
    if (ctx == NULL) {
        return -ENOMEM;
    }

    tsize = dtypeSize(dtype);

    b = isDoubleBasedType(dtype);
    kgenDeclareUptrs(ctx, b);
    declareBlasEnums(ctx);

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.kextra = kextra;
    gset.pgran = pgran;

    // generate necessary memory to image copying functions
    generateImageCopyFuncs(&copyImgFuncs, ctx, CLBLAS_GEMM, &gset);

    kgenAddBlankLine(ctx);
    vecLen = sizeof(cl_float4) / dtypeSize(dtype);
    typeName = dtypeBuiltinType(dtype);
    fpref = dtypeToBlasPrefix(dtype);

    if (kextra->kernType == CLBLAS_PREP_A_KERNEL) {
        sprintf(tmp, prepareImagesGemmDeclA, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        // NOTE(review): this 'ret' is overwritten later without a check
        ret = kgenBeginFuncBody(ctx);

        // same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadA,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].y, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        // complex data may need a conjugating copy path (atcase 110)
        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                    "(transA == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
                             "    %s((LPtr)temp, (GPtr)A, m, k, lda);\n"
                             "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_A]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageA,
                subdims[1].y, subdims[1].y, // y = m + dy <= M ?...
                subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].y, // aligned = (x==bw1)&&(y==dy1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_A) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_A],
                copyImgFuncs.globalToImage[MATRIX_A],
                vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_A],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_A],
                copyImgFuncs.localToImage[MATRIX_A],
                vecLen);
        kgenAddStmt(ctx, tmp);
    }
    else { // PREP_B
        sprintf(tmp, prepareImagesGemmDeclB, fpref, typeName, typeName);
        kgenDeclareFunction(ctx, tmp);
        // NOTE(review): this 'ret' is overwritten later without a check
        ret = kgenBeginFuncBody(ctx);

        // same local buffer is used for both matrix A and matrix B blocks
        localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize);
        localBufSize *= vecLen;

        kgenDeclareGroupID(ctx, "gid", pgran);
        sprintf(tmp, functionHeadB,
                subdims[1].bwidth - 1, subdims[1].bwidth,
                subdims[1].x, subdims[1].bwidth,
                typeName, localBufSize);
        kgenAddStmt(ctx, tmp);

        // complex data may need a conjugating copy path (atcase 110)
        if (isComplexType(dtype)) {
            conjCond = "atcase += ((atcase == 10) && "
                    "(transB == clblasConjTrans)) ? 100 : 0;\n";
            sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n"
                             "    %s((LPtr)temp, (GPtr)B, n, k, ldb);\n"
                             "    break;\n",
                    copyImgFuncs.globalToLocal[MATRIX_B]);
        }
        else {
            conjCond = "";
            strcpy(conjStr, "");
        }

        sprintf(tmp, copyToImageB,
                subdims[1].x, subdims[1].x, // y = n + dy <= N ?...
                subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?...
                subdims[1].bwidth, subdims[1].x, // aligned = (x==bw1)&&(y==dx1)
                (kextra->flags & KEXTRA_NO_COPY_VEC_B) == 0,
                conjCond,
                copyImgFuncs.zeroBlock[MATRIX_B],
                copyImgFuncs.globalToImage[MATRIX_B],
                vecLen,
                conjStr,
                copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalGeneric[MATRIX_B],
                subdims[1].bwidth,
                copyImgFuncs.globalToLocalTransposed[MATRIX_B],
                copyImgFuncs.localToImage[MATRIX_B],
                vecLen);
        kgenAddStmt(ctx, tmp);
    }

    kgenEndFuncBody(ctx);

    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}
Ejemplo n.º 13
0
/*
 * Validate a two-level (group/subgroup) subproblem decomposition and,
 * depending on "check", either verify the supplied work-group granularity
 * (PGRAN_CHECK) or fill it in.
 *
 * Returns true when the decomposition is acceptable, false otherwise.
 */
static bool
subgCheckCalcDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    // type size expressed in float words; scales the dimension limits below
    unsigned int fwords = dtypeSize(dtype) / sizeof(cl_float);
    const SubproblemDim *group;
    const SubproblemDim *subg;

    // reject invalid arguments (EINVAL)
    if ((subdimsNum < 2) || (pgran == NULL) || (subdims == NULL)) {
        return false;
    }

    group = &subdims[0];
    subg = &subdims[1];

    // every decomposition dimension must be non-zero
    if (group->x == 0 || group->y == 0 || group->bwidth == 0 ||
        subg->x == 0 || subg->y == 0 || subg->bwidth == 0) {
        return false;
    }

    // a subgroup block must match the per-item block exactly
    if (subg->x != subg->itemX || subg->y != subg->itemY) {
        return false;
    }

    // the group block must consist of an integer number of subgroup blocks
    if ((group->x % subg->x != 0) ||
        (group->y % subg->y != 0) ||
        (group->bwidth % subg->bwidth != 0)) {
        return false;
    }

    // check fitting of bw to common vector sizes
    if (isComplexType(dtype) && (2 * subg->bwidth > 32)) {
        return false;
    }

    // subgroup-level dimension limits
    if (subg->bwidth > 16 / fwords ||
        subg->x > 1 ||
        subg->y > 16 / fwords) {
        return false;
    }

    // group-level dimension limits
    if (group->bwidth > 128 || group->x > 1 || group->y > 128) {
        return false;
    }

    // the decomposition must map onto exactly 64 work items
    if ((group->y / subg->y) * (group->bwidth / subg->bwidth) != 64) {
        return false;
    }

    if (group->y > group->bwidth &&
        group->y / group->bwidth < group->bwidth / subg->bwidth) {
        return false;
    }

    if (PGRAN_CHECK == check) {
        // passed PGranularity should be checked
        return (pgran->wgSize[0] * pgran->wgSize[1] == 64);
    }

    // PGranularity should be calculated
    pgran->wgDim = 1;
    pgran->wgSize[1] = 1;
    pgran->wgSize[0] = 64;

    return true;
}
Ejemplo n.º 14
0
/*
 * Generate the family of data-block copy functions needed by an image-based
 * kernel and record the generated function names in "copyFuncs":
 *   - global -> image for matrices A and B,
 *   - local -> image for matrices A and B,
 *   - global -> local (optimized, generic, transposed and
 *     transposed-generic variants) for A and B,
 *   - two local zero-fill functions (one per matrix block).
 *
 * A single CopyPattern structure is reused and mutated cumulatively between
 * findGenerateFunction() calls, so the statement order below is significant
 * (e.g. "flags" and "generic" keep their previous value unless reassigned).
 * For CLBLAS_TRMM ("customize") some variants are skipped when the access
 * pattern makes them unnecessary.
 *
 * Returns 0 on success, -ENOMEM if the generator guard cannot be created.
 * NOTE(review): intermediate findGenerateFunction()/kgenAddBlankLine()
 * results are not checked; only the last kgenAddBlankLine() status is
 * propagated.
 */
int
generateImageCopyFuncs(
    CopyImgFuncs *copyFuncs,
    struct KgenContext *ctx,
    BlasFunctionID funcID,
    const BlasGenSettings *gset)
{
    const SubproblemDim *dims = gset->subdims;
    KernelExtraFlags kflags = gset->kextra->flags;
    DataType dtype = gset->kextra->dtype;
    const PGranularity *pgran = gset->pgran;
    CopyPattern pattern;
    // mandatory flags for global to local copying
    DBlockCopyFlags glcpFlags[2] = {0, 0};
    struct KgenGuard *guard;
    unsigned int tsize;
    int ret = 0;
    bool isTra, areTails, isConjA;
    bool customize;

    // honor per-matrix "do not vectorize" requests in every
    // global -> local variant generated below
    if (kflags & KEXTRA_NO_COPY_VEC_A) {
        glcpFlags[0] = DBLOCK_COPY_NOT_VECTORIZE;
    }
    if (kflags & KEXTRA_NO_COPY_VEC_B) {
        glcpFlags[1] = DBLOCK_COPY_NOT_VECTORIZE;
    }

    tsize = dtypeSize(dtype);
    isTra = isMatrixAccessColMaj(funcID, kflags, MATRIX_A);
    isConjA = isMatrixConj(kflags, MATRIX_A);
    areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N));
    // TRMM gets a reduced (customized) set of copy functions
    customize = (funcID == CLBLAS_TRMM);

    // the guard avoids emitting the same function twice for equal patterns
    guard = createKgenGuard(ctx, cpyImgGenCallback, sizeof(CopyPattern));
    if (guard == NULL) {
        return -ENOMEM;
    }

    memset(&pattern, 0, sizeof(pattern));

    // baseline pattern; individual sections below tweak only what differs
    pattern.zeroing = false;
    pattern.dim = dims[0];
    pattern.dir = DBLOCK_GLOBAL_TO_IMAGE;
    pattern.dtype = dtype;
    pattern.flags = 0;
    pattern.generic = false;
    pattern.pgran = pgran;

    // global -> image for A; skipped for TRMM with transposed or
    // conjugated A (those presumably go through local memory instead)
    if (!(customize && (isTra || isConjA))) {
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[0].y;
        findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // global -> image for B
    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[0].x;
    findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // local -> image for A
    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[1].y;
    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
    findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_A],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // local -> image for B
    pattern.dim.x = dims[0].bwidth;
    pattern.dim.y = dims[1].x;
    pattern.dir = DBLOCK_LOCAL_TO_IMAGE;
    findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // Global to local optimized
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    if (customize || isComplexType(dtype)) {
        pattern.flags = (!customize || isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
        pattern.flags |= glcpFlags[0];
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[1].y;
        findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // conjugating global -> local for B is only needed by complex GEMM
    if ((funcID == CLBLAS_GEMM) && isComplexType(dtype)) {
        pattern.flags = DBLOCK_COPY_CONJUGATE | glcpFlags[1];
        pattern.dim.x = dims[0].bwidth;
        pattern.dim.y = dims[1].x;
        findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_B],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // Global to local generic
    pattern.dim = dims[0];
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    pattern.generic = true;
    if (!customize || areTails) {
        pattern.flags = (isConjA) ? DBLOCK_COPY_CONJUGATE : 0;
        pattern.flags |= glcpFlags[0];
        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocalGeneric[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // generic variant for B is always generated
    pattern.flags = (kflags & KEXTRA_CONJUGATE_B) ? DBLOCK_COPY_CONJUGATE : 0;
    pattern.flags |= glcpFlags[1];
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalGeneric[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // Global to local transposed functions
    pattern.dir = DBLOCK_GLOBAL_TO_LOCAL;
    pattern.flags = (kflags & KEXTRA_NO_COPY_VEC_A) ?
                    DBLOCK_COPY_NOT_VECTORIZE : 0;
    pattern.flags |= glcpFlags[0];
    if (!customize || isTra) {
        pattern.generic = false;
        if (isConjA) {
            pattern.flags |= DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
        }
        else {
            pattern.flags |= DBLOCK_COPY_TRANSPOSE;
        }
        // transposed block: x and y are swapped relative to the plain copy
        pattern.dim.x = dims[1].y;
        pattern.dim.y = dims[0].bwidth;

        findGenerateFunction(guard, &pattern,
                             copyFuncs->globalToLocalTransposed[MATRIX_A],
                             FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // transposed-generic variant for A: dimensions are runtime parameters
    if (!customize || (isTra && areTails)) {
        pattern.generic = true;
        pattern.dim.x = 0;
        pattern.dim.y = 0;
        findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposedGeneric[MATRIX_A],
                         FUNC_NAME_MAXLEN);
        kgenAddBlankLine(ctx);
    }

    // transposed variants for B (always generated)
    pattern.generic = false;
    pattern.dim.x = dims[1].x;
    pattern.dim.y = dims[0].bwidth;
    if (kflags & KEXTRA_CONJUGATE_B) {
        pattern.flags = DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE;
    }
    else {
        pattern.flags = DBLOCK_COPY_TRANSPOSE;
    }
    pattern.flags |= glcpFlags[1];
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposed[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.generic = true;
    pattern.dim.x = 0;
    pattern.dim.y = 0;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->globalToLocalTransposedGeneric[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    // generate two local zeroing functions for matrix A and matrix B blocks
    pattern.zeroing = true;
    pattern.dim = dims[0];
    pattern.generic = false;
    pattern.flags = 0;
    // zeroing treats the block as a single row of float4-aligned words
    pattern.dim.y = 1;
    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].y;

    findGenerateFunction(guard, &pattern,
                         copyFuncs->zeroBlock[MATRIX_A],
                         FUNC_NAME_MAXLEN);
    kgenAddBlankLine(ctx);

    pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].x;
    findGenerateFunction(guard, &pattern,
                         copyFuncs->zeroBlock[MATRIX_B],
                         FUNC_NAME_MAXLEN);
    ret = kgenAddBlankLine(ctx);

    destroyKgenGuard(guard);
    return ret;
}
Ejemplo n.º 15
0
/*
 * Build the generated kernel source "ker", run it once on a device of the
 * requested type with a fixed ITEM_WORK_M x ITEM_WORK_N work group, verify
 * the result against a naive CPU multiplication, and report the profiled
 * kernel execution time.
 *
 * Returns CL_SUCCESS on success, otherwise the first failing OpenCL error
 * code (or CL_OUT_OF_HOST_MEMORY if a host allocation fails).
 *
 * Fixes over the previous revision: all OpenCL objects (context, queue,
 * program, kernel, buffers, event) and all host buffers are now released
 * on every exit path via a single cleanup block; malloc results and the
 * clCreateProgramWithSource/clEnqueueReadBuffer statuses are checked;
 * size_t values are printed with %zu; profiling counters use cl_ulong as
 * required by clGetEventProfilingInfo.
 */
cl_int
run (
        const char *ker,
        cl_uint M,
        cl_uint N,
        cl_uint K,
        FType alpha,
        BlasGenSettings *gset,
        TileMulFlags flags,
        cl_device_type deviceType,
        bool verbose,
        unsigned int iterNum)
{
    cl_int err;
    cl_platform_id platform;
    cl_context ctx = NULL;
    cl_device_id device;
    cl_command_queue queue = NULL;
    cl_event evt = NULL;
    DataType dtype = gset->kextra->dtype;

    cl_mem bufA = NULL, bufB = NULL, bufC = NULL;
    FPtr A, B, C, C_naive;
    bool isComplex = isComplexType(dtype);
    bool isDouble = isDoubleBasedType(dtype);
    cl_uint nwords = (isComplex) ? 2 : 1;
    unsigned int tsize = dtypeSize(dtype);
    cl_kernel kernel = NULL;
    size_t i, j, k;
    size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N};
    char log[100000];
    size_t logSize;
    /* clGetEventProfilingInfo reports cl_ulong values */
    cl_ulong sTime, fTime;
    cl_program program = NULL;

    /* NULL so the cleanup block can free unconditionally */
    A.v = NULL;
    B.v = NULL;
    C.v = NULL;
    C_naive.v = NULL;

    clGetPlatformIDs(1, &platform, NULL);

    clGetDeviceIDs(platform, deviceType, 1, &device, NULL);

    ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        return err;
    }

    queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    /* Prepare OpenCL kernel and its arguments */

    program = clCreateProgramWithSource(ctx, 1, &ker, NULL, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    /* the build log is useful even on success */
    clGetProgramBuildInfo (program,
            device,
            CL_PROGRAM_BUILD_LOG,
            sizeof(log),
            log,
            &logSize);
    printf("%s", log);
    if (err != CL_SUCCESS){
        goto cleanup;
    }

    kernel = clCreateKernel(program, kernelName, &err);
    if (err != CL_SUCCESS){
        goto cleanup;
    }

    /* Memory allocation */

    A.v = malloc(M * K * tsize);
    B.v = malloc(K * N * tsize);
    C.v = malloc(M * N * tsize);
    C_naive.v = malloc(M * N * tsize);
    if (A.v == NULL || B.v == NULL || C.v == NULL || C_naive.v == NULL) {
        err = CL_OUT_OF_HOST_MEMORY;
        goto cleanup;
    }

#if JUST_MULTIPLICATION
    /* deterministic ramp data - easy to eyeball in verbose mode */
    srand(0);
    if (isDouble) {
        for(i = 0; i < M * K * nwords; i++){
            A.d[i] = i;
        }
        for(i = 0; i < N * K * nwords; i++){
            B.d[i] = i + 7;
        }
        for(i = 0; i < M * N * nwords; i++){
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for(i = 0; i < M * K * nwords; i++){
            A.f[i] = i;
        }
        for(i = 0; i < N * K * nwords; i++){
            B.f[i] = i + 7;
        }
        for(i = 0; i < M * N * nwords; i++){
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }

#else
    /* bounded pseudo-random data with a fixed seed for reproducibility */
    srand(0);
    if (isDouble) {
        for(i = 0; i < M * K * nwords; i++){
            A.d[i] = (double)(rand() % RAND_BOUND);
        }
        for(i = 0; i < N * K * nwords; i++){
            B.d[i] = (double)(rand() % RAND_BOUND);
        }
        for(i = 0; i < M * N * nwords; i++){
            C.d[i] = 0.0;
            C_naive.d[i] = 0.0;
        }
    }
    else {
        for(i = 0; i < M * K * nwords; i++){
            A.f[i] = (float)(rand() % RAND_BOUND);
        }
        for(i = 0; i < N * K * nwords; i++){
            B.f[i] = (float)(rand() % RAND_BOUND);
        }
        for(i = 0; i < M * N * nwords; i++){
            C.f[i] = 0.0;
            C_naive.f[i] = 0.0;
        }
    }
#endif

    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
            K * M * tsize, A.v, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
            K * N * tsize, B.v, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
        M * N * tsize, C.v, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    /* Argument setting and kernel execution */
    err = clSetKernelArg(kernel, 0, tsize, alpha.u);
    err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA);
    err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB);
    err |= clSetKernelArg(kernel, 3, sizeof(M), &M);
    err |= clSetKernelArg(kernel, 4, sizeof(N), &N);
    err |= clSetKernelArg(kernel, 5, sizeof(K), &K);
    err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC);
    err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
        globalWorkSize, localWorkSize, 0,
        NULL, &evt);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    clFinish(queue);
    err = clEnqueueReadBuffer (queue,
        bufC,
        CL_TRUE,
        0,
        M * N * tsize,
        C.v,
        0,
        NULL,
        NULL);
    if (err != CL_SUCCESS) {
        /* don't compare against an unread result buffer */
        goto cleanup;
    }

    /* Naive CPU multiplication */
    if (isDouble) {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_double2 val;
                    for (k = 0; k < K; k++) {
                        cl_double2 bkj = flags & TILEMUL_TRB ?
                                B.d2[j * K + k] : B.d2[k * N + j];
                        cl_double2 aik = flags & TILEMUL_TRA ?
                                A.d2[k * M + i] : A.d2[i * K + k];
                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.d2[i * N + j].s[0] += val.s[0];
                        C_naive.d2[i * N + j].s[1] += val.s[1];
                    }
                    /* complex scaling by alpha after the accumulation */
                    val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] -
                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[1];
                    val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] +
                            C_naive.d2[i * N + j].s[1] * alpha.d2.s[0];
                    C_naive.d2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        double bkj = flags & TILEMUL_TRB ?
                                B.d[j * K + k] : B.d[k * N + j];
                        double aik = flags & TILEMUL_TRA ?
                                A.d[k * M + i] : A.d[i * K + k];
                        C_naive.d[i * N + j] += aik * bkj;
                    }
                    C_naive.d[i * N + j] *= alpha.d;
                }
            }
        }

        /* exact (bitwise) comparison; report only the first mismatch */
        for (i = 0; i < M * N; i++) {
            if (C.d[i] != C_naive.d[i]) {
                printf("Differ at (%zu, %zu): %lf != %lf\n", i / N, i % N,
                        C.d[i], C_naive.d[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }
    else {
        for (i = 0; i < M; i++) {
            for (j = 0; j < N; j++) {
                if (isComplex) {
                    cl_float2 val;
                    for (k = 0; k < K; k++) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                                B.f2[j * K + k] : B.f2[k * N + j];
                        cl_float2 aik = flags & TILEMUL_TRA ?
                                A.f2[k * M + i] : A.f2[i * K + k];
                        val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1];
                        val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0];
                        C_naive.f2[i * N + j].s[0] += val.s[0];
                        C_naive.f2[i * N + j].s[1] += val.s[1];
                    }
                    val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] -
                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[1];
                    val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] +
                            C_naive.f2[i * N + j].s[1] * alpha.f2.s[0];
                    C_naive.f2[i * N + j] = val;
                }
                else {
                    for (k = 0; k < K; k++) {
                        float bkj = flags & TILEMUL_TRB ?
                                B.f[j * K + k] : B.f[k * N + j];
                        float aik = flags & TILEMUL_TRA ?
                                A.f[k * M + i] : A.f[i * K + k];
                        C_naive.f[i * N + j] += aik * bkj;
                    }
                    C_naive.f[i * N + j] *= alpha.f;
                }
            }
        }

        for (i = 0; i < M * N; i++) {
            if (C.f[i] != C_naive.f[i]) {
                printf("Differ at (%zu, %zu): %lf != %lf\n",
                        i / N, i % N, C.f[i], C_naive.f[i]);
                break;
            }
        }
        if (i == M * N) {
            printf("Match\n");
        }
    }

    /* End of naive CPU multiplication */
    if (verbose) {
        /* verbose dump only implemented for single precision */
        if (!isDouble) {
            printf("Matrix A:\n");
            for (i = 0; i < M; i++) {
                for (k = 0; k < K; k++) {
                    if (isComplex) {
                        cl_float2 aik = flags & TILEMUL_TRA ?
                                A.f2[k * M + i] : A.f2[i * K + k];
                        printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]);
                    }
                    else {
                        float aik = flags & TILEMUL_TRA ?
                                A.f[k * M + i] : A.f[i * K + k];
                        printf("%4.1f ", aik);
                    }
                }
                printf("\n");
            }

            printf("Matrix B:\n");
            for (k = 0; k < K; k++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        cl_float2 bkj = flags & TILEMUL_TRB ?
                                B.f2[j * K + k] : B.f2[k * N + j];
                        printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]);
                    }
                    else {
                        float bkj = flags & TILEMUL_TRB ?
                                B.f[j * K + k] : B.f[k * N + j];
                        printf("%4.1f ", bkj);
                    }
                }
                printf("\n");
            }

            printf("CPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                                C_naive.f2[i * N + j].s[0],
                                C_naive.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C_naive.f[i * N + j]);
                    }
                }
                printf("\n");
            }

            printf("GPU calculated matrix:\n");
            for (i = 0; i < M; i++) {
                for (j = 0; j < N; j++) {
                    if (isComplex) {
                        printf("(%4.1f, %4.1f) ",
                                C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]);
                    }
                    else {
                        printf("%4.1f ", C.f[i * N + j]);
                    }
                }
                printf("\n");
            }
        }
    }

    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
            &sTime, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),
            &fTime, NULL);

    printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n",
            (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iterNum));

    err = CL_SUCCESS;

cleanup:
    /* clRelease* functions are not NULL-safe, hence the guards */
    if (evt != NULL) {
        clReleaseEvent(evt);
    }
    if (bufC != NULL) {
        clReleaseMemObject(bufC);
    }
    if (bufB != NULL) {
        clReleaseMemObject(bufB);
    }
    if (bufA != NULL) {
        clReleaseMemObject(bufA);
    }
    if (kernel != NULL) {
        clReleaseKernel(kernel);
    }
    if (program != NULL) {
        clReleaseProgram(program);
    }
    if (queue != NULL) {
        clReleaseCommandQueue(queue);
    }
    clReleaseContext(ctx);
    free(A.v);
    free(B.v);
    free(C.v);
    free(C_naive.v);

    return err;
}
Ejemplo n.º 16
0
//-----------------------------------------------------------------------------
// TODO: reimplement via new validation API
//
// Validate a two-level (group/subgroup) decomposition for the subgroup-based
// kernel variant, estimate register pressure, and either verify (PGRAN_CHECK)
// or compute the work-group granularity.
//
// Fix: argument sanity checks (subdimsNum/pgran/subdims) added, matching the
// sibling subgCheckCalcDecomp implementation; previously a NULL pgran or
// subdims was dereferenced unconditionally.
static bool
subgCheckCalcDecomp( PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check )
{
    unsigned int subgA = 0;
    unsigned int subgB = 0;
    unsigned int regUse = 0;
    unsigned int itemsPerSubg = 0;

    //EINVAL
    if( (subdimsNum<2)||
        (NULL==pgran)||
        (NULL==subdims) ){

        return false;
    }

    if( 0 == subdims[0].x ||
        0 == subdims[0].y ||
        0 == subdims[0].bwidth ||
        0 == subdims[1].x ||
        0 == subdims[1].y ||
        0 == subdims[1].bwidth ){

        return false;
    }

    // subgroup grid shape and work items per subgroup
    subgA = subdims[0].y/subdims[1].y;
    subgB = subdims[0].x/subdims[1].x;
    itemsPerSubg = subdims[0].bwidth/subdims[1].bwidth;

    if( itemsPerSubg < 4 ){
        return false;
    }

    if( subdims[1].y < 4 ||
        subdims[1].x < 4 ||
        subdims[1].bwidth < 4 ){
        return false;
    }

    if( subdims[1].x != subdims[1].itemX ||
        subdims[1].y != subdims[1].itemY ){

        return false;
    }

    // the group block must consist of integer number of subgroup blocks
    if( subdims[0].x % subdims[1].x ||
        subdims[0].y % subdims[1].y ||
        subdims[0].bwidth % subdims[1].bwidth ){

        return false;
    }

    //check fitting of bw to common vector sizes
    if( isComplexType(dtype) ){

        if( 2*subdims[1].bwidth > 16 ){

            return false;
        }
    }

    // check dimensions
    if( subdims[1].bwidth > 16 ||
        subdims[1].x > 16 ||
        subdims[1].y > 16 ){

        return false;
    }

    // estimate register usage, drop
    // inevitably slowed decompositions
    regUse =
        (   subdims[1].bwidth * subdims[1].x +
            subdims[1].bwidth * subdims[1].y +
            subdims[1].x * subdims[1].y ) *
        dtypeSize(dtype);

    regUse /= 16; // 16 bytes per register

    if( regUse >= 64 ){
        return false;
    }

    // passed PGranularity should be checked
    if( PGRAN_CHECK == check ){

        if( pgran->wgDim != 1 ){
            return false;
        }
        if( pgran->wgSize[0] != 64 ){
            return false;
        }

        if( pgran->wgSize[0] != subgA*subgB*itemsPerSubg ){
            return false;
        }
    }
    // PGranularity should be calculated
    else{
        pgran->wgDim = 1;
        pgran->wgSize[0] = subgA * subgB * itemsPerSubg;
    }

    return true;
}