Example #1
static void
calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *_extra)
{
    DUMMY_ARG_USAGE(subdims);
    const CLBLASKernExtra *extra = (CLBLASKernExtra *)_extra;
    CLBlasKargs *kargs = (CLBlasKargs *)args;
    SolutionStep *step = container_of(kargs, args, SolutionStep);
    TargetDevice *kDevice = &(step->device);

    cl_int err;
    unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err );
    if(err != CL_SUCCESS) {
        numComputeUnits = 1;
    }

    unsigned int vecLen = extra->vecLenA;
    unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1];

    unsigned int wgToSpawn = ((kargs->N - 1) / (blockSize * vecLen)) + 1;
    wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) );

    threads[0] = wgToSpawn * blockSize;
    threads[1] = 1;
}
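
Every example below routes otherwise unused parameters through DUMMY_ARG_USAGE. In clBLAS this macro exists only to silence unused-parameter warnings; a minimal stand-in definition, assuming nothing more than that (the exact definition lives in the clBLAS headers and may differ):

/* Stand-in for the clBLAS macro: mark a parameter as intentionally unused
 * so the compiler does not emit an unused-parameter warning. */
#ifndef DUMMY_ARG_USAGE
#define DUMMY_ARG_USAGE(arg) ((void)(arg))
#endif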
Example #2
// unrolling generator for the f4zero function
static int
f4zeroSingle(struct KgenContext *ctx, void *priv)
{
    DUMMY_ARG_USAGE(priv);

    return kgenAddStmt(ctx, "*data++ = 0;\n");
}
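
Since f4zeroSingle emits only a single statement, the unrolling comes from the caller invoking it repeatedly. A minimal sketch of such a driver, assuming the KgenContext/kgenAddStmt codegen API used above (f4zeroUnroll itself is hypothetical, not a clBLAS function):

/* Hypothetical helper: emit the single zeroing statement 'count' times
 * to produce an unrolled sequence of "*data++ = 0;" lines. */
static int
f4zeroUnroll(struct KgenContext *ctx, unsigned int count)
{
    int ret = 0;

    for (unsigned int i = 0; i < count && ret == 0; i++) {
        ret = f4zeroSingle(ctx, NULL);
    }

    return ret;
}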
Example #3
static bool
checkCalcDecompDedicated(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check)
{
    bool ret = true;

    DUMMY_ARG_USAGE(subdimsNum);

    if (check == PGRAN_CHECK) {
        unsigned int minSize, maxSize;

        maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8;
        minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 1 : 2;
        ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true);
        ret = ret && (subdims[0].bwidth == subdims[1].bwidth);
        ret = ret && (pgran->wgSize[0] == 64);
    }
    else {
        calcPgranDedicated(pgran, subdims, -1, 3);
    }

    return ret;
}
Example #4
static int trmmGetDefaultDecomp( PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs)
{
    DUMMY_ARG_USAGE(subdimsNum);

    if ( NULL == pArgs ) {
        return -EINVAL;
    }

    subdims[1].bwidth = 2;
    subdims[1].x = subdims[1].itemX = 8;
    subdims[1].y = subdims[1].itemY = 8;

    subdims[0].bwidth = 2;
    subdims[0].x = subdims[0].itemX = 32;
    subdims[0].y = 128;
    subdims[0].itemY = -1;

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    return 0;
}
Example #5
/** The purpose of this function is to add a work-group size indicator to
    kernelKey, so that a different kernel is generated when the work-group size
    changes. The reduction loop is unrolled in kprintf based on the work-group size.

    The SubproblemDim member bwidth is used to store the work-group size of the
    current kernel; it thus becomes part of the kernelKey, and the kernel cache
    is managed accordingly.
    Note -- SubproblemDim is a member of kernelKey
**/
static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra)
{
    DUMMY_ARG_USAGE(extra);
    CLBlasKargs *kargs = (CLBlasKargs*)args;
    SolutionStep *step = container_of(kargs, args, SolutionStep);

    subdims->bwidth = (step->pgran.wgSize[0]) * (step->pgran.wgSize[1]);
}
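
Both this example and Example #1 recover the enclosing SolutionStep from the kernel-arguments pointer with container_of, used here with the argument order (pointer, member, type). A sketch of the classic offsetof-based idiom it presumably wraps (the actual clBLAS definition may differ in detail):

#include <stddef.h>

/* Sketch of the container_of idiom: given a pointer to the 'member' field,
 * step back to the start of the enclosing structure of type 'type'. */
#define container_of(ptr, member, type) \
    ((type *)((char *)(ptr) - offsetof(type, member)))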
Example #6
static int
copyMemPreUnroll(struct KgenContext *ctx, void *priv)
{
    DUMMY_ARG_USAGE(priv);

    kgenAddStmt(ctx, "src1 = src;\n");

    return kgenAddStmt(ctx, "dst1 = dst;\n\n");
}
Example #7
//
// FIXME: Report correct return value - Needs change in KPRINTF
//
static ssize_t
generator(
   char *buf,
   size_t buflen,
   const struct SubproblemDim *subdims,
   const struct PGranularity *pgran,
   void *extra)
{

    DUMMY_ARG_USAGE(subdims);
    size_t BLOCKSIZE = pgran->wgSize[0];
    char tempTemplate[32*1024];

    if (buf == NULL) // return buffer size
    {
        buflen = (32 * 1024 * sizeof(char));
        return (ssize_t)buflen;
    }
    CLBLASKernExtra *extraFlags = (CLBLASKernExtra *)extra;

    #ifdef DEBUG_ASUM
    printf("ASUM GENERATOR called....\n");
    printf("dataType : %c\n", Prefix[extraFlags->dtype]);
    #endif

    unsigned int vecLenA = extraFlags->vecLenA;

    #ifdef DEBUG_ASUM
    printf("Vector length used : %d\n\n", vecLenA);
    #endif

    bool doVLOAD = false;
    if (extraFlags->flags & KEXTRA_NO_COPY_VEC_A)
    {
        doVLOAD = true;
        #ifdef DEBUG_ASUM
        printf("Doing VLOAD as aligned data pointer is not available\n");
        #endif
    }
    else
    {
        #ifdef DEBUG_ASUM
        printf("Using aligned data pointer\n");
        #endif
    }
    strcpy(tempTemplate, (char*)asum_kernel);
    kprintf kobj(Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE);
    kobj.spit((char*)buf, tempTemplate);

    return (32 * 1024 * sizeof(char));
}
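
The NULL check at the top of the generator implies a two-phase calling convention: query the required buffer size first, then generate into a caller-supplied buffer. An illustrative caller (not the actual clBLAS driver code), assuming subdims, pgran, and extra are already set up:

ssize_t needed = generator(NULL, 0, subdims, pgran, extra);      /* size query  */
char *kernelSrc = (char *)malloc((size_t)needed);
if (kernelSrc != NULL) {
    generator(kernelSrc, (size_t)needed, subdims, pgran, extra); /* emit source */
}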
Example #8
static int
subgGetPerf( unsigned int kflags,
    const void *args )
{
    DUMMY_ARG_USAGE(args);

    if( !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_A ) &&
        !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_B ) ){

        return PPERF_GOOD;
    }

    return PPERF_NOT_SUPPORTED;
}
Example #9
static int trmmSubgGetDefaultDecomp( PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void *pArgs)
{
    int itemsPerSubg = 4;
    int subgA = 8;
    int subgB = 2;

    int bw1 = 8;
    int x1 = 4;
    int y1 = 4;
    CLBlasKargs *kargs;

    DUMMY_ARG_USAGE(subdimsNum);

    if ( NULL == pArgs ) {
        return -EINVAL;
    }

    kargs = (CLBlasKargs *)pArgs;

    if( isComplexType(kargs->dtype) ){
        bw1 /= 2;
    }
    if( isDoubleBasedType(kargs->dtype) ){
        bw1 /= 2;
    }

    subdims[1].bwidth = bw1;
    subdims[1].x = subdims[1].itemX = x1;
    subdims[1].y = subdims[1].itemY = y1;

    subdims[0].bwidth = bw1 * itemsPerSubg;
    subdims[0].itemX = x1 * subgB;
    subdims[0].x = x1*subgB;

    subdims[0].itemY = y1*subgA;
    subdims[0].y = y1*subgA;

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    return 0;
}
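
For a double-precision complex argument type both halvings above apply, so bw1 drops from 8 to 2 and subdims[0].bwidth becomes 2 * 4 = 8. A standalone check of that arithmetic (the values mirror the defaults above; the program itself is illustrative):

#include <stdio.h>

int main(void)
{
    int itemsPerSubg = 4;
    int bw1 = 8;

    bw1 /= 2;   /* complex type      */
    bw1 /= 2;   /* double-based type */

    printf("subdims[1].bwidth = %d\n", bw1);                /* prints 2 */
    printf("subdims[0].bwidth = %d\n", bw1 * itemsPerSubg); /* prints 8 */
    return 0;
}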
Example #10
static int
symvSubgGetDefaultDecomp(
    PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    void * pArgs)
{
    (void)subdimsNum;
    DUMMY_ARG_USAGE(pArgs);

    pgran->wgDim = 1;
    pgran->wgSize[0] = 64;
    pgran->wgSize[1] = 1;

    subdims[1].bwidth = 4;
    subdims[1].itemX = subdims[1].x = 1;
    subdims[1].itemY = subdims[1].y = 4;

    subdims[0].bwidth = 8 * subdims[1].bwidth;
    subdims[0].itemX = subdims[0].x = 1;
    subdims[0].itemY = subdims[0].y = 8 * subdims[1].y;

    return 0;
}
Example #11
void initialize_scalars(double alpha, double beta)
{
    DUMMY_ARG_USAGE(beta);
    buffer_.alpha_ = makeScalar<T>(alpha);
}
Example #12
//-----------------------------------------------------------------------------
// TODO: reimplement via new validation API
static bool
subgCheckCalcDecomp( PGranularity *pgran,
    SubproblemDim *subdims,
    unsigned int subdimsNum,
    DataType dtype,
    int check )
{
    unsigned int subgA = 0;
    unsigned int subgB = 0;
    unsigned int regUse = 0;
    unsigned int itemsPerSubg = 0;

    DUMMY_ARG_USAGE(subdimsNum);

    if( 0 == subdims[0].x ||
        0 == subdims[0].y ||
        0 == subdims[0].bwidth ||
        0 == subdims[1].x ||
        0 == subdims[1].y ||
        0 == subdims[1].bwidth ){

        return false;
    }

    subgA = subdims[0].y/subdims[1].y;
    subgB = subdims[0].x/subdims[1].x;
    itemsPerSubg = subdims[0].bwidth/subdims[1].bwidth;

    if( itemsPerSubg < 4 ){
        return false;
    }

    if( subdims[1].y < 4 ||
        subdims[1].x < 4 ||
        subdims[1].bwidth < 4 ){
        return false;
    }

    if( subdims[1].x != subdims[1].itemX ||
        subdims[1].y != subdims[1].itemY ){

        return false;
    }

    // the group block must consist of integer number of subgroup blocks
    if( subdims[0].x % subdims[1].x ||
        subdims[0].y % subdims[1].y ||
        subdims[0].bwidth % subdims[1].bwidth ){

        return false;
    }

    // check that bwidth fits common vector sizes
    if( isComplexType(dtype) ){

        if( 2*subdims[1].bwidth > 16 ){

            return false;
        }
    }

    // check dimensions
    if( subdims[1].bwidth > 16 ||
        subdims[1].x > 16 ||
        subdims[1].y > 16 ){

        return false;
    }

    // estimate register usage and drop
    // decompositions that would inevitably be slow
    regUse =
        (   subdims[1].bwidth * subdims[1].x +
            subdims[1].bwidth * subdims[1].y +
            subdims[1].x * subdims[1].y ) *
        dtypeSize(dtype);

    regUse /= 16; // 16 bytes per register

    if( regUse >= 64 ){
        return false;
    }

    // passed PGranularity should be checked
    if( PGRAN_CHECK == check ){

        if( pgran->wgDim != 1 ){
            return false;
        }
        if( pgran->wgSize[0] != 64 ){
            return false;
        }

        if( pgran->wgSize[0] != subgA*subgB*itemsPerSubg ){
            return false;
        }
    }
    // PGranularity should be calculated
    else{
        pgran->wgDim = 1;
        pgran->wgSize[0] = subgA * subgB * itemsPerSubg;
    }

    return true;
}
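
As a worked instance of the register-usage heuristic above: a hypothetical float decomposition with subdims[1].bwidth = 8 and subdims[1].x = subdims[1].y = 4 gives (8*4 + 8*4 + 4*4) * 4 / 16 = 20 registers, well under the 64-register cutoff. A self-contained check (the values and the 4-byte type size are illustrative assumptions):

#include <stdio.h>

int main(void)
{
    unsigned int bwidth = 8, x = 4, y = 4;   /* hypothetical subdims[1] */
    unsigned int typeSize = 4;               /* assume a 4-byte float   */

    unsigned int regUse = (bwidth * x + bwidth * y + x * y) * typeSize;
    regUse /= 16;                            /* 16 bytes per register   */

    printf("estimated registers: %u (%s)\n", regUse,
           (regUse < 64) ? "accepted" : "rejected");
    return 0;
}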